Example 1
0
    def predict_proba(self, X_df, give_reasons=False):
        """Estimate class probabilities for each example using a fit Ruleset.

        Parameters
        ----------
        X_df : DataFrame
            Examples to score.
        give_reasons : bool, default=False
            If True, also return the covering Rules behind each prediction.

        Returns
        -------
        ndarray
            Per-example probabilities ordered (negative, positive). An example
            predicted positive whose covering rules all lack recorded class
            frequencies receives 0 for both classes, and a warning is issued.
            If give_reasons is True, returns a tuple of (probas, reasons),
            where reasons holds, per example, the list of covering Rules
            (an empty list for negative predictions).
        """

        # Probability row shared by every negatively-predicted example.
        fallback_proba = weighted_avg_freqs([self.uncovered_class_freqs])

        # Predictions plus the rules that fired for each example.
        preds, reasons = self.predict(X_df, give_reasons=True, warn=False)

        no_sample_idx = []  # examples whose rules all lack recorded freqs
        probas = np.empty(shape=(len(preds), fallback_proba.shape[0]))
        for idx, (pred, rules) in enumerate(zip(preds, reasons)):
            if not pred:
                probas[idx, :] = fallback_proba
                continue
            # Only rules that recorded class frequencies (i.e. had enough
            # samples during (re)calibration) may contribute to the estimate.
            usable_freqs = [
                rule.class_freqs for rule in rules
                if rule.class_freqs is not None
            ]
            if usable_freqs:
                probas[idx, :] = weighted_avg_freqs(usable_freqs)
            else:
                probas[idx, :] = 0
                no_sample_idx.append(idx)

        # Surface the examples that could not be given a real probability.
        if no_sample_idx:
            _warn(
                f"Some examples lacked any rule with sufficient sample size to predict_proba: {no_sample_idx}\n Consider running recalibrate_proba with smaller param min_samples, or set require_min_samples=False",
                RuntimeWarning,
                filename="base",
                funcname="predict_proba",
            )

        # Optionally bundle the covering rules alongside the probabilities.
        return flagged_return([True, give_reasons], [probas, reasons])
 def _set_deprecated_fit_params(self, params):
     """Absorb hyperparameters passed to .fit that belong to __init__.

     Each recognized hyperparameter in params is assigned as an attribute
     on self; a DeprecationWarning listing everything moved is then issued.
     """
     moved = []
     for name, value in params.items():
         if name not in self.VALID_HYPERPARAMETERS:
             continue
         moved.append(name)
         setattr(self, name, value)
     if moved:
         _warn(
             f"In the future, you should assign these parameters when initializating classifier instead of during model fitting: {moved}",
             DeprecationWarning,
             "irep/ripper",
             "fit",
         )
Example 3
0
 def _check_allpos_allneg(self, warn=False, warnstack=""):
     """Report whether the Ruleset always predicts positive or always negative.

     Returns a (is_universal, is_null) tuple; when warn is True, emits a
     RuntimeWarning for whichever degenerate case applies.
     """
     message = None
     if self.isuniversal() and warn:
         message = "Ruleset is universal. All predictions it makes with method .predict will be positive. It may be untrained or was trained on a dataset split lacking negative examples."
     elif self.isnull() and warn:
         message = "Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples."
     if message is not None:
         _warn(
             message,
             RuntimeWarning,
             filename="base",
             funcname="_check_allpos_allneg",
             warnstack=warnstack,
         )
     return self.isuniversal(), self.isnull()
Example 4
0
def recalibrate_proba(ruleset,
                      Xy_df,
                      class_feat,
                      pos_class,
                      min_samples=10,
                      require_min_samples=True):
    """Recalibrate a Ruleset's probability estimations using unseen labeled data without changing the underlying model. May improve .predict_proba generalizability.

    Does not affect the underlying model or which predictions it makes -- only
    the per-Rule probability estimates (rule.class_freqs and
    ruleset.uncovered_class_freqs). Use params min_samples and
    require_min_samples to select desired behavior.

    Note1: RunTimeWarning will occur as a reminder when min_samples and require_min_samples params might result in unintended effects.
    Note2: It is possible recalibrating could result in some positive .predict predictions with <0.5 .predict_proba positive probability.

    Parameters
    ----------
    ruleset : Ruleset
        Ruleset to recalibrate. Mutated in place.
    Xy_df : DataFrame
        Labeled dataset.
    class_feat : str
        Name of class feature column in Xy_df.
    pos_class : value, typically str or int
        Positive class value.
    min_samples : int, default=10
        Required minimum number of samples per Rule. Regardless of min_samples, at least one sample of the correct class is always required.
    require_min_samples : bool, default=True
        Halt (with warning) if any Rule lacks the minimum number of samples.
        Setting to False will warn, but still replace Rules probabilities even if the minimum number of samples is not met.

    Returns
    -------
    None
        Returns early (without updating frequencies) when require_min_samples
        is True and any Rule or the uncovered set lacks enough samples.
    """

    # At least this many samples per rule (or neg) must be of correct class
    required_correct_samples = 1

    # If not using min_samples (None/0/negative), fall back to 1
    if not min_samples or min_samples < 1:
        min_samples = 1

    # Collect each Rule's pos and neg frequencies in list "rule_class_freqs"
    # Store rules that lack enough samples in list "insufficient_rules"
    df = Xy_df

    rule_class_freqs = [None] * len(ruleset.rules)
    insufficient_rules = []
    for i, rule in enumerate(ruleset.rules):
        # Count covered examples by label; stored as (neg, pos) to match
        # the (negative, positive) ordering used by predict_proba.
        npos_pred = num_pos(rule.covers(df),
                            class_feat=class_feat,
                            pos_class=pos_class)
        nneg_pred = num_neg(rule.covers(df),
                            class_feat=class_feat,
                            pos_class=pos_class)
        neg_pos_pred = (nneg_pred, npos_pred)
        rule_class_freqs[i] = neg_pos_pred
        # Rule has insufficient samples if fewer than min_samples, or no
        # samples at all, or lacks the required "correct class" count.
        # NOTE(review): the "correct class" check reads index 0, the NEGATIVE
        # count, for rules that predict positive -- confirm this is intended
        # (the assignment loop below applies the same index-0 check).
        # Also, `sum(neg_pos_pred) < 1` is redundant once min_samples >= 1.
        if (sum(neg_pos_pred) < min_samples or sum(neg_pos_pred) < 1
                or neg_pos_pred[0] < required_correct_samples):
            insufficient_rules.append(rule)

    # Collect class frequencies for negative predictions: examples covered
    # by no rule. tn_fn = (true negatives, false negatives).
    uncovered = df.drop(ruleset.covers(df).index)
    neg_freq = num_neg(uncovered, class_feat=class_feat, pos_class=pos_class)
    tn_fn = (neg_freq, len(uncovered) - neg_freq)

    # Issue warnings if trouble with sample size
    if require_min_samples:
        if insufficient_rules:  # WARN if/which rules lack enough samples
            pretty_insufficient_rules = "\n".join(
                [str(r) for r in insufficient_rules])
            warning_str = f"param min_samples={min_samples}; insufficient number of samples or fewer than {required_correct_samples} correct samples for rules {pretty_insufficient_rules}"
            _warn(
                warning_str,
                RuntimeWarning,
                filename="base_functions",
                funcname="recalibrate_proba",
            )
        if neg_freq < min_samples or tn_fn[
                1] < 1:  # WARN if neg lacks enough samples
            # NOTE(review): "labled" typo in the runtime warning string below;
            # fixing it would change emitted text, so it is left as-is here.
            warning_str = f"param min_samples={min_samples}; insufficient number of negatively labled samples"
            _warn(
                warning_str,
                RuntimeWarning,
                filename="base_functions",
                funcname="recalibrate_proba",
            )
        if insufficient_rules or sum(tn_fn) < min_samples:
            # NOTE(review): this inner require_min_samples check is redundant
            # (we are already inside `if require_min_samples:`), so the else
            # branch below is unreachable.
            if (require_min_samples
                ):  # WARN if require_min_samples -> halting recalibration
                warning_str = f"Recalibrating halted. to recalibrate, try using more samples, lowering min_samples, or set require_min_samples to False"
                _warn(
                    warning_str,
                    RuntimeWarning,
                    filename="base_functions",
                    funcname="recalibrate_proba",
                )
                return
            else:  # GO AHEAD EVEN THOUGH NOT ENOUGH SAMPLES
                pass
                # warning_str = f'Because require_min_samples=False, recalibrating probabilities for any rules with enough samples min_samples>={min_samples} that have at least {required_correct_samples} correct samples even though not all rules have enough samples. Probabilities for any rules that lack enough samples will be retained.'
                # _warn(warning_str, RuntimeWarning, filename='base_functions', funcname='recalibrate_proba')

    # Assign collected frequencies to Rules. Rules without enough samples
    # get class_freqs=None, which predict_proba treats as "no estimate".
    for rule, freqs in zip(ruleset.rules, rule_class_freqs):
        if sum(freqs) >= min_samples and freqs[0] >= required_correct_samples:
            rule.class_freqs = freqs
        else:
            rule.class_freqs = None

    # Assign Ruleset's uncovered frequencies: always set when missing so the
    # attribute exists afterward; otherwise only replace when sufficient.
    if not hasattr(ruleset, "uncovered_class_freqs") or (
            neg_freq >= min_samples and tn_fn[1] >= required_correct_samples):
        ruleset.uncovered_class_freqs = tn_fn

    # Warn if no neg samples
    # NOTE(review): both calls below hardcode pos_class=1 instead of passing
    # this function's pos_class argument -- confirm against
    # _warn_only_single_class's signature; this looks like a latent bug.
    if (sum([freqs[0] for freqs in rule_class_freqs]) +
            ruleset.uncovered_class_freqs[0] == 0):
        _warn_only_single_class(
            only_value=1,
            pos_class=1,
            filename="base_functions",
            funcname="recalibrate_proba",
        )
    # Warn if no pos samples
    elif (sum([freqs[1] for freqs in rule_class_freqs]) +
          ruleset.uncovered_class_freqs[1] == 0):
        _warn_only_single_class(
            only_value=0,
            pos_class=1,
            filename="base_functions",
            funcname="recalibrate_proba",
        )