Example #1
def explain_prediction(pipeline, input_features, top_k=3, training_data=None, include_shap_values=False,
                       output_format="text"):
    """Creates table summarizing the top_k positive and top_k negative contributing features to the prediction of a single datapoint.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        top_k (int): How many of the highest/lowest features to include in the table.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            This is required for non-tree estimators because we need a sample of training data for the KernelSHAP algorithm.
        include_shap_values (bool): Whether the SHAP values should be included in an extra column in the output.
            Default is False.
        output_format (str): Either "text", "dict", or "dataframe". Default is "text".

    Returns:
        str, dict, or pd.DataFrame - A report explaining the most positive/negative contributing features to the prediction.
    """
    input_features = _convert_to_woodwork_structure(input_features)
    if not (isinstance(input_features, ww.DataTable) and input_features.shape[0] == 1):
        raise ValueError("features must be stored in a dataframe or datatable with exactly one row.")
    input_features = _convert_woodwork_types_wrapper(input_features.to_dataframe())
    if training_data is not None:
        training_data = _convert_to_woodwork_structure(training_data)
        training_data = _convert_woodwork_types_wrapper(training_data.to_dataframe())

    if output_format not in {"text", "dict", "dataframe"}:
        raise ValueError(f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}")
    return _make_single_prediction_shap_table(pipeline, input_features, top_k, training_data, include_shap_values,
                                              output_format=output_format)
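# A minimal sketch (hypothetical contribution values, not the library's SHAP internals) of
# the "top_k positive / top_k negative" selection that the table above summarizes.
import pandas as pd

contributions = pd.Series({"age": 0.42, "income": -0.31, "tenure": 0.05, "balance": -0.77, "score": 0.18})
top_k = 2
# Keep the top_k most positive and top_k most negative contributions.
table = pd.concat([contributions.nlargest(top_k), contributions.nsmallest(top_k)]).to_frame("contribution")
print(table)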
Example #2
def precision_recall_curve(y_true, y_pred_proba):
    """
    Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True binary labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label.

    Returns:
        dict: Dictionary containing metrics used to generate a precision-recall plot, with the following keys:

                  * `precision`: Precision values.
                  * `recall`: Recall values.
                  * `thresholds`: Threshold values used to produce the precision and recall.
                  * `auc_score`: The area under the precision-recall curve.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_pred_proba = _convert_to_woodwork_structure(y_pred_proba)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series())
    y_pred_proba = _convert_woodwork_types_wrapper(y_pred_proba.to_series())

    precision, recall, thresholds = sklearn_precision_recall_curve(
        y_true, y_pred_proba)
    auc_score = sklearn_auc(recall, precision)
    return {
        'precision': precision,
        'recall': recall,
        'thresholds': thresholds,
        'auc_score': auc_score
    }
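# A minimal usage sketch (hypothetical labels and probabilities) of the same computation done
# with scikit-learn directly; auc_score here is the area under the precision-recall curve.
import numpy as np
from sklearn.metrics import auc as sklearn_auc
from sklearn.metrics import precision_recall_curve as sklearn_precision_recall_curve

y_true = np.array([0, 0, 1, 1])
y_pred_proba = np.array([0.1, 0.4, 0.35, 0.8])
precision, recall, thresholds = sklearn_precision_recall_curve(y_true, y_pred_proba)
print({'precision': precision, 'recall': recall, 'thresholds': thresholds,
       'auc_score': sklearn_auc(recall, precision)})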
Example #3
    def _fit_transform_features_helper(self, needs_fitting, X, y=None):
        """Helper function that transforms the input data based on the component graph components.

        Arguments:
            needs_fitting (boolean): Determines if components should be fit.
            X (ww.DataTable, pd.DataFrame): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series): The target training data of length [n_samples]. Defaults to None.

        Returns:
            ww.DataTable: Transformed values.
        """
        if len(self.compute_order) <= 1:
            return _convert_to_woodwork_structure(X)
        component_outputs = self._compute_features(self.compute_order[:-1],
                                                   X,
                                                   y=y,
                                                   fit=needs_fitting)
        final_component_inputs = []
        for parent in self.get_parents(self.compute_order[-1]):
            parent_output = component_outputs.get(
                parent, component_outputs.get(f'{parent}.x'))
            if isinstance(parent_output, ww.DataColumn):
                parent_output = parent_output.to_series()
                parent_output = pd.DataFrame(parent_output, columns=[parent])
                parent_output = _convert_to_woodwork_structure(parent_output)
            final_component_inputs.append(parent_output)
        concatted = pd.concat(
            [component_input.to_dataframe() for component_input in final_component_inputs],
            axis=1)
        return _convert_to_woodwork_structure(concatted)
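# A minimal sketch (hypothetical component outputs) of the final-input assembly above: a
# single-column output is wrapped into a named one-column frame so it can be concatenated
# next to table outputs.
import pandas as pd

table_output = pd.DataFrame({"f1": [1, 2], "f2": [3, 4]})
column_output = pd.Series([0.5, 0.7])
final_component_inputs = [table_output, pd.DataFrame({"Component A": column_output})]
print(pd.concat(final_component_inputs, axis=1))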
Example #4
def explain_predictions(pipeline,
                        input_features,
                        training_data=None,
                        top_k_features=3,
                        include_shap_values=False,
                        output_format="text"):
    """Creates a report summarizing the top contributing features for each data point in the input features.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Dataframe of input data to evaluate the pipeline on.
        training_data (ww.DataTable, pd.DataFrame): Dataframe of data the pipeline was fit on. This can be omitted for pipelines
            with tree-based estimators.
        top_k_features (int): How many of the highest/lowest contributing features to include in the table for each
            data point.
        include_shap_values (bool): Whether SHAP values should be included in the table. Default is False.
        output_format (str): Either "text" or "dict". Default is "text".

    Returns:
        str or dict - A report explaining the top contributing features to each prediction for each row of input_features.
            The report will include the feature names, prediction contribution, and SHAP Value (optional).
    """
    input_features = _convert_to_woodwork_structure(input_features)
    input_features = _convert_woodwork_types_wrapper(
        input_features.to_dataframe())
    if training_data is not None:
        training_data = _convert_to_woodwork_structure(training_data)
        training_data = _convert_woodwork_types_wrapper(
            training_data.to_dataframe())

    if input_features.empty:
        raise ValueError(
            "Parameter input_features must be a non-empty dataframe.")
    if output_format not in {"text", "dict"}:
        raise ValueError(
            f"Parameter output_format must be either text or dict. Received {output_format}"
        )
    data = _ReportData(pipeline,
                       input_features,
                       y_true=None,
                       y_pred=None,
                       y_pred_values=None,
                       errors=None,
                       index_list=range(input_features.shape[0]),
                       metric=None)

    report_creator = _report_creator_factory(
        data,
        report_type="explain_predictions",
        output_format=output_format,
        top_k_features=top_k_features,
        include_shap_values=include_shap_values)
    return report_creator(data)
Example #5
def calculate_permutation_importance(pipeline,
                                     X,
                                     y,
                                     objective,
                                     n_repeats=5,
                                     n_jobs=None,
                                     random_state=0):
    """Calculates permutation importance for features.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame): The input data used to score and compute permutation importance
        y (ww.DataColumn, pd.Series): The target data
        objective (str, ObjectiveBase): Objective to score on
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Returns:
        pd.DataFrame: Mean feature importance scores over n_repeats permutations of each feature, with columns "feature" and "importance".
    """
    X = _convert_to_woodwork_structure(X)
    y = _convert_to_woodwork_structure(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())

    objective = get_objective(objective, return_instance=True)
    if not objective.is_defined_for_problem_type(pipeline.problem_type):
        raise ValueError(
            f"Given objective '{objective.name}' cannot be used with '{pipeline.name}'"
        )

    def scorer(pipeline, X, y):
        scores = pipeline.score(X, y, objectives=[objective])
        return scores[objective.name] if objective.greater_is_better else -scores[objective.name]

    perm_importance = sk_permutation_importance(pipeline,
                                                X,
                                                y,
                                                n_repeats=n_repeats,
                                                scoring=scorer,
                                                n_jobs=n_jobs,
                                                random_state=random_state)
    mean_perm_importance = perm_importance["importances_mean"]
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    feature_names = list(X.columns)
    mean_perm_importance = list(zip(feature_names, mean_perm_importance))
    mean_perm_importance.sort(key=lambda x: x[1], reverse=True)
    return pd.DataFrame(mean_perm_importance,
                        columns=["feature", "importance"])
Example #6
    def transform(self, X, y=None):
        """Computes the delayed features for all features in X and y.

        For each feature in X, it will add a column to the output dataframe for each
        delay in the (inclusive) range [1, max_delay]. The values of each delayed feature are simply the original
        feature shifted forward in time by the delay amount. For example, a delay of 3 units means that the feature
        value at row n will be taken from the (n-3)th row of that feature.

        If y is not None, it will also compute the delayed values for the target variable.

        Arguments:
            X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
            y (pd.Series, None): Target.

        Returns:
            pd.DataFrame: Transformed X.
        """
        if X is None:
            X = pd.DataFrame()
        # Normalize the data into pandas objects
        X = _convert_to_woodwork_structure(X)

        categorical_columns = self._get_categorical_columns(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        if self.delay_features and len(X) > 0:
            X_categorical = self._encode_X_while_preserving_index(
                X[categorical_columns])
            for col_name in X:
                col = X[col_name]
                if col_name in categorical_columns:
                    col = X_categorical[col_name]
                X = X.assign(
                    **{
                        f"{col_name}_delay_{t}": col.shift(t)
                        for t in range(1, self.max_delay + 1)
                    })

        # Handle cases where the target was passed in
        if self.delay_target and y is not None:
            y = _convert_to_woodwork_structure(y)
            if y.logical_type == logical_types.Categorical:
                y = self._encode_y_while_preserving_index(y)
            else:
                y = _convert_woodwork_types_wrapper(y.to_series())
            X = X.assign(
                **{
                    f"target_delay_{t}": y.shift(t)
                    for t in range(self.start_delay_for_target,
                                   self.max_delay + 1)
                })

        return X
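# A minimal sketch (plain pandas, synthetic data) of how each delayed column is built:
# for every delay t in the inclusive range [1, max_delay], the new column is feature.shift(t),
# so the value at row n comes from row n - t (with NaN for the first t rows).
import pandas as pd

X = pd.DataFrame({"feature": [10, 20, 30, 40, 50]})
max_delay = 2
X = X.assign(**{f"feature_delay_{t}": X["feature"].shift(t) for t in range(1, max_delay + 1)})
print(X)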
Example #7
def roc_curve(y_true, y_pred_proba):
    """
    Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied.

    Returns:
        list(dict): A list of dictionaries (with one for each class) is returned. Binary classification problems return a list with one dictionary.
            Each dictionary contains metrics used to generate an ROC plot with the following keys:
                  * `fpr_rates`: False positive rates.
                  * `tpr_rates`: True positive rates.
                  * `thresholds`: Threshold values used to produce each pair of true/false positive rates.
                  * `auc_score`: The area under the ROC curve.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_pred_proba = _convert_to_woodwork_structure(y_pred_proba)
    if isinstance(y_pred_proba, ww.DataTable):
        y_pred_proba = _convert_woodwork_types_wrapper(
            y_pred_proba.to_dataframe()).to_numpy()
    else:
        y_pred_proba = _convert_woodwork_types_wrapper(
            y_pred_proba.to_series()).to_numpy()
    y_true = _convert_woodwork_types_wrapper(y_true.to_series()).to_numpy()

    if len(y_pred_proba.shape) == 1:
        y_pred_proba = y_pred_proba.reshape(-1, 1)
    if y_pred_proba.shape[1] == 2:
        y_pred_proba = y_pred_proba[:, 1].reshape(-1, 1)
    nan_indices = np.logical_or(pd.isna(y_true),
                                np.isnan(y_pred_proba).any(axis=1))
    y_true = y_true[~nan_indices]
    y_pred_proba = y_pred_proba[~nan_indices]

    lb = LabelBinarizer()
    lb.fit(np.unique(y_true))
    y_one_hot_true = lb.transform(y_true)
    n_classes = y_one_hot_true.shape[1]

    curve_data = []
    for i in range(n_classes):
        fpr_rates, tpr_rates, thresholds = sklearn_roc_curve(
            y_one_hot_true[:, i], y_pred_proba[:, i])
        auc_score = sklearn_auc(fpr_rates, tpr_rates)
        curve_data.append({
            'fpr_rates': fpr_rates,
            'tpr_rates': tpr_rates,
            'thresholds': thresholds,
            'auc_score': auc_score
        })

    return curve_data
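# A minimal usage sketch (hypothetical binary labels and scores) of what one entry of the
# returned list contains, using scikit-learn directly.
import numpy as np
from sklearn.metrics import auc as sklearn_auc
from sklearn.metrics import roc_curve as sklearn_roc_curve

y_true = np.array([0, 0, 1, 1])
y_pred_proba = np.array([0.1, 0.4, 0.35, 0.8])
fpr_rates, tpr_rates, thresholds = sklearn_roc_curve(y_true, y_pred_proba)
print({'fpr_rates': fpr_rates, 'tpr_rates': tpr_rates, 'thresholds': thresholds,
       'auc_score': sklearn_auc(fpr_rates, tpr_rates)})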
Example #8
    def predict(self, X):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data of shape [n_samples, n_features]

        Returns:
            ww.DataColumn: Predicted values.
        """
        if len(self.compute_order) == 0:
            return _convert_to_woodwork_structure(X)
        final_component = self.compute_order[-1]
        outputs = self._compute_features(self.compute_order, X)
        return _convert_to_woodwork_structure(
            outputs.get(final_component, outputs.get(f'{final_component}.x')))
Example #9
    def predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions

        Returns:
            ww.DataColumn: Predicted values.
        """
        X = _convert_to_woodwork_structure(X)
        predictions = self._component_graph.predict(X)
        predictions_series = predictions.to_series()
        predictions_series.name = self.input_target_name
        return _convert_to_woodwork_structure(predictions_series)
Example #10
def normalize_confusion_matrix(conf_mat, normalize_method='true'):
    """Normalizes a confusion matrix.

    Arguments:
        conf_mat (ww.DataTable, pd.DataFrame or np.ndarray): Confusion matrix to normalize.
        normalize_method ({'true', 'pred', 'all'}): Normalization method. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'.

    Returns:
        pd.DataFrame: normalized version of the input confusion matrix. The column header represents the predicted labels while row header represents the actual labels.
    """
    conf_mat = _convert_to_woodwork_structure(conf_mat)
    conf_mat = _convert_woodwork_types_wrapper(conf_mat.to_dataframe())
    col_names = conf_mat.columns

    conf_mat = conf_mat.to_numpy()
    with warnings.catch_warnings(record=True) as w:
        if normalize_method == 'true':
            conf_mat = conf_mat.astype('float') / conf_mat.sum(
                axis=1)[:, np.newaxis]
        elif normalize_method == 'pred':
            conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=0)
        elif normalize_method == 'all':
            conf_mat = conf_mat.astype('float') / conf_mat.sum().sum()
        else:
            raise ValueError(
                'Invalid value provided for "normalize_method": {}'.format(
                    normalize_method))
        if w and "invalid value encountered in" in str(w[0].message):
            raise ValueError(
                "Sum of given axis is 0 and normalization is not possible. Please select another option."
            )
    conf_mat = pd.DataFrame(conf_mat, index=col_names, columns=col_names)
    return conf_mat
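# A minimal sketch (hypothetical counts) of the 'true' normalization branch above: each row
# is divided by its own sum, so every row of actual labels sums to 1.
import numpy as np
import pandas as pd

conf_mat = pd.DataFrame([[5, 1], [2, 2]], index=["a", "b"], columns=["a", "b"])
values = conf_mat.to_numpy().astype('float')
normalized = values / values.sum(axis=1)[:, np.newaxis]
print(pd.DataFrame(normalized, index=conf_mat.index, columns=conf_mat.columns))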
Example #11
def test_more_top_n_unique_values_large():
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]
    })

    random_seed = 2
    test_random_state = get_random_seed(random_seed)

    encoder = OneHotEncoder(top_n=3, random_state=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need to make the conversion here too
    X = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
    col_1_counts = col_1_counts.sample(frac=1, random_state=test_random_state)
    col_1_counts = col_1_counts.sort_values(["col_1"],
                                            ascending=False,
                                            kind='mergesort')
    col_1_samples = col_1_counts.head(
        encoder.parameters['top_n']).index.tolist()
    expected_col_names = set([
        "col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c",
        "col_4"
    ])
    for val in col_1_samples:
        expected_col_names.add("col_1_" + val)

    col_names = set(X_t.columns)
    assert (col_names == expected_col_names)
Example #12
    def transform(self, X, y=None):
        self._provenance = {col: [f"{col}_doubled"] for col in X.columns}
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        new_X = X.assign(**{f"{col}_doubled": 2 * X.loc[:, col] for col in X.columns})
        if self.drop_old_columns:
            new_X = new_X.drop(columns=X.columns)
        return _convert_to_woodwork_structure(new_X)
Example #13
    def fit(self, X, y):
        """Fit each component in the graph

        Arguments:
            X (ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series): The target training data of length [n_samples]
        """
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        self._compute_features(self.compute_order, X, y, fit=True)
        self._feature_provenance = self._get_feature_provenance(X.columns)
        return self
Example #14
def get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold=None):
    """Combines y_true and y_pred into a single dataframe and adds a column for outliers. Used in `graph_prediction_vs_actual()`.

    Arguments:
        y_true (pd.Series, ww.DataColumn, or np.ndarray): The real target values of the data.
        y_pred (pd.Series, ww.DataColumn, or np.ndarray): The predicted values output by the regression model.
        outlier_threshold (int, float): A positive threshold for what is considered an outlier value. This value is compared to the absolute difference
                                 between each value of y_true and y_pred. Values within this threshold will be blue, otherwise they will be yellow.
                                 Defaults to None.

    Returns:
        pd.DataFrame with the following columns:
                * `prediction`: Predicted values from regression model.
                * `actual`: Real target values.
                * `outlier`: Color indicating whether each value is within the outlier threshold (blue) or outside it (yellow).

    """
    if outlier_threshold and outlier_threshold <= 0:
        raise ValueError(
            f"Threshold must be positive! Provided threshold is {outlier_threshold}"
        )

    y_true = _convert_to_woodwork_structure(y_true)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series())
    y_pred = _convert_to_woodwork_structure(y_pred)
    y_pred = _convert_woodwork_types_wrapper(y_pred.to_series())

    predictions = y_pred.reset_index(drop=True)
    actual = y_true.reset_index(drop=True)
    data = pd.concat([pd.Series(predictions), pd.Series(actual)], axis=1)
    data.columns = ['prediction', 'actual']
    if outlier_threshold:
        data['outlier'] = np.where(
            (abs(data['prediction'] - data['actual']) >= outlier_threshold),
            "#ffff00", "#0000ff")
    else:
        data['outlier'] = '#0000ff'
    return data
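# A minimal sketch (hypothetical values) of the outlier coloring above: rows whose absolute
# error is at least the threshold are marked yellow ("#ffff00"), the rest blue ("#0000ff").
import numpy as np
import pandas as pd

data = pd.DataFrame({'prediction': [10.0, 22.0, 31.0], 'actual': [11.0, 20.0, 45.0]})
outlier_threshold = 5
data['outlier'] = np.where(abs(data['prediction'] - data['actual']) >= outlier_threshold,
                           "#ffff00", "#0000ff")
print(data)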
Example #15
def confusion_matrix(y_true, y_predicted, normalize_method='true'):
    """Confusion matrix for binary and multiclass classification.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True labels.
        y_predicted (ww.DataColumn, pd.Series or np.ndarray): Predictions from a classifier.
        normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'.

    Returns:
        pd.DataFrame: Confusion matrix. The column header represents the predicted labels while row header represents the actual labels.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_predicted = _convert_to_woodwork_structure(y_predicted)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series()).to_numpy()
    y_predicted = _convert_woodwork_types_wrapper(
        y_predicted.to_series()).to_numpy()
    labels = unique_labels(y_true, y_predicted)
    conf_mat = sklearn_confusion_matrix(y_true, y_predicted)
    conf_mat = pd.DataFrame(conf_mat, index=labels, columns=labels)
    if normalize_method is not None:
        return normalize_confusion_matrix(conf_mat,
                                          normalize_method=normalize_method)
    return conf_mat
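# A minimal usage sketch (hypothetical labels) of the unnormalized path above: scikit-learn's
# confusion matrix wrapped in a labeled DataFrame, where columns are predicted labels and
# rows are actual labels.
import pandas as pd
from sklearn.metrics import confusion_matrix as sklearn_confusion_matrix
from sklearn.utils.multiclass import unique_labels

y_true = ["cat", "dog", "cat", "dog", "dog"]
y_predicted = ["cat", "cat", "cat", "dog", "dog"]
labels = unique_labels(y_true, y_predicted)
print(pd.DataFrame(sklearn_confusion_matrix(y_true, y_predicted), index=labels, columns=labels))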
Example #16
    def _consolidate_inputs(x_inputs, y_input, X, y):
        """Combines any/all X and y inputs for a component, including handling defaults

        Arguments:
            x_inputs (list(pd.DataFrame)): Data to be used as X input for a component
            y_input (pd.Series, None): If present, the Series to use as y input for a component, different from the original y
            X (ww.DataTable, pd.DataFrame): The original X input, to be used if there is no parent X input
            y (ww.DataColumn, pd.Series): The original y input, to be used if there is no parent y input

        Returns:
            ww.DataTable, ww.DataColumn: The X and y transformed values to evaluate a component with
        """
        if len(x_inputs) == 0:
            return_x = X
        else:
            return_x = pd.concat(x_inputs, axis=1)
        return_y = y
        if y_input is not None:
            return_y = y_input
        return_x = _convert_to_woodwork_structure(return_x)
        if return_y is not None:
            return_y = _convert_to_woodwork_structure(return_y)
        return return_x, return_y
Example #17
def get_prediction_vs_actual_over_time_data(pipeline, X, y, dates):
    """Get the data needed for the prediction_vs_actual_over_time plot.

    Arguments:
        pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline.
        X (ww.DataTable, pd.DataFrame): Features used to generate new predictions.
        y (ww.DataColumn, pd.Series): Target values to compare predictions against.
        dates (ww.DataColumn, pd.Series): Dates corresponding to target values and predictions.

    Returns:
       pd.DataFrame: Dataframe with "dates", "target", and "prediction" columns.
    """

    dates = _convert_to_woodwork_structure(dates)
    y = _convert_to_woodwork_structure(y)
    prediction = pipeline.predict(X, y)

    dates = _convert_woodwork_types_wrapper(dates.to_series())
    y = _convert_woodwork_types_wrapper(y.to_series())
    return pd.DataFrame({
        "dates": dates.reset_index(drop=True),
        "target": y.reset_index(drop=True),
        "prediction": prediction.reset_index(drop=True)
    })
Example #18
def explain_predictions_best_worst(pipeline, input_features, y_true, num_to_explain=5, top_k_features=3,
                                   include_shap_values=False, metric=None, output_format="text"):
    """Creates a report summarizing the top contributing features for the best and worst points in the dataset as measured by error to true labels.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Input data to evaluate the pipeline on.
        y_true (ww.DataColumn, pd.Series): True labels for the input data.
        num_to_explain (int): How many of the best and worst data points to explain.
        top_k_features (int): How many of the highest/lowest contributing features to include in the table for each
            data point.
        include_shap_values (bool): Whether SHAP values should be included in the table. Default is False.
        metric (callable): The metric used to identify the best and worst points in the dataset. Function must accept
            the true labels and predicted value or probabilities as the only arguments and lower values
            must be better. By default, this will be the absolute error for regression problems and cross entropy loss
            for classification problems.
        output_format (str): Either "text", "dict", or "dataframe". Default is "text".

    Returns:
        str, dict, or pd.DataFrame - A report explaining the top contributing features for the best/worst predictions in the input_features.
            For each of the best/worst rows of input_features, the predicted values, true labels, metric value,
            feature names, prediction contribution, and SHAP Value (optional) will be listed.
    """
    input_features = _convert_to_woodwork_structure(input_features)
    input_features = _convert_woodwork_types_wrapper(input_features.to_dataframe())
    y_true = _convert_to_woodwork_structure(y_true)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series())

    if not (input_features.shape[0] >= num_to_explain * 2):
        raise ValueError(f"Input features must be a dataframe with more than {num_to_explain * 2} rows! "
                         "Convert to a dataframe and select a smaller value for num_to_explain if you do not have "
                         "enough data.")
    if y_true.shape[0] != input_features.shape[0]:
        raise ValueError("Parameters y_true and input_features must have the same number of data points. Received: "
                         f"true labels: {y_true.shape[0]} and {input_features.shape[0]}")
    if output_format not in {"text", "dict", "dataframe"}:
        raise ValueError(f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}")
    if not metric:
        metric = DEFAULT_METRICS[pipeline.problem_type]

    try:
        if pipeline.problem_type == ProblemTypes.REGRESSION:
            y_pred = pipeline.predict(input_features).to_series()
            y_pred_values = None
            errors = metric(y_true, y_pred)
        else:
            y_pred = pipeline.predict_proba(input_features).to_dataframe()
            y_pred_values = pipeline.predict(input_features).to_series()
            errors = metric(pipeline._encode_targets(y_true), y_pred)
    except Exception as e:
        tb = traceback.format_tb(sys.exc_info()[2])
        raise PipelineScoreError(exceptions={metric.__name__: (e, tb)}, scored_successfully={})

    sorted_scores = errors.sort_values()
    best = sorted_scores.index[:num_to_explain]
    worst = sorted_scores.index[-num_to_explain:]
    index_list = best.tolist() + worst.tolist()

    data = _ReportData(pipeline, input_features, y_true, y_pred, y_pred_values, errors, index_list, metric)

    report_creator = _report_creator_factory(data, report_type="explain_predictions_best_worst",
                                             output_format=output_format, top_k_features=top_k_features,
                                             include_shap_values=include_shap_values, num_to_explain=num_to_explain)
    return report_creator(data)
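# A minimal sketch (synthetic error values) of the best/worst row selection above: lower
# error is better, so after sorting, the first num_to_explain indices are the "best" rows
# and the last num_to_explain are the "worst".
import pandas as pd

errors = pd.Series([0.9, 0.1, 0.5, 0.05, 0.7, 0.3])
num_to_explain = 2
sorted_scores = errors.sort_values()
best = sorted_scores.index[:num_to_explain]
worst = sorted_scores.index[-num_to_explain:]
print(best.tolist() + worst.tolist())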
Example #19
def partial_dependence(pipeline, X, feature, grid_resolution=100):
    """Calculates partial dependence.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input data used to generate a grid of values
            for feature where partial dependence will be calculated at
        feature (int, string): The target feature for which to create the partial dependence plot.
            If feature is an int, it must be the index of the feature to use.
            If feature is a string, it must be a valid column name in X.
        grid_resolution (int): Number of points in the grid of feature values at which the partial dependence is computed. Defaults to 100.

    Returns:
        pd.DataFrame: DataFrame with averaged predictions for all points in the grid averaged
            over all samples of X and the values used to calculate those predictions. The dataframe will
            contain two columns: "feature_values" (grid points at which the partial dependence was calculated) and
            "partial_dependence" (the partial dependence at that feature value). For classification problems, there
            will be a third column called "class_label" (the class label for which the partial
            dependence was calculated). For binary classification, the partial dependence is only calculated for the
            "positive" class.

    """
    X = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())

    if not pipeline._is_fitted:
        raise ValueError(
            "Pipeline to calculate partial dependence for must be fitted")
    if pipeline.model_family == ModelFamily.BASELINE:
        raise ValueError(
            "Partial dependence plots are not supported for Baseline pipelines"
        )
    if isinstance(pipeline, evalml.pipelines.ClassificationPipeline):
        pipeline._estimator_type = "classifier"
    elif isinstance(pipeline, evalml.pipelines.RegressionPipeline):
        pipeline._estimator_type = "regressor"
    pipeline.feature_importances_ = pipeline.feature_importance
    if ((isinstance(feature, int) and X.iloc[:, feature].isnull().sum())
            or (isinstance(feature, str) and X[feature].isnull().sum())):
        warnings.warn(
            "There are null values in the features, which will cause NaN values in the partial dependence output. Fill in these values to remove the NaN values.",
            NullsInColumnWarning)
    try:
        avg_pred, values = sk_partial_dependence(
            pipeline, X=X, features=[feature], grid_resolution=grid_resolution)
    finally:
        # Delete scikit-learn attributes that were temporarily set
        del pipeline._estimator_type
        del pipeline.feature_importances_
    classes = None
    if isinstance(pipeline, evalml.pipelines.BinaryClassificationPipeline):
        classes = [pipeline.classes_[1]]
    elif isinstance(pipeline,
                    evalml.pipelines.MulticlassClassificationPipeline):
        classes = pipeline.classes_

    data = pd.DataFrame({
        "feature_values":
        np.tile(values[0], avg_pred.shape[0]),
        "partial_dependence":
        np.concatenate([pred for pred in avg_pred])
    })
    if classes is not None:
        data['class_label'] = np.repeat(classes, len(values[0]))

    return data
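# A minimal sketch (synthetic data, plain scikit-learn estimator) of what partial dependence
# means: for each grid value of the chosen feature, overwrite that column everywhere and
# average the model's predictions over all samples. The number of grid points plays the
# role of grid_resolution.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame({"a": rng.normal(size=200), "b": rng.normal(size=200)})
y = 3 * X["a"] + rng.normal(scale=0.1, size=200)
model = RandomForestRegressor(random_state=0).fit(X, y)

grid = np.linspace(X["a"].min(), X["a"].max(), num=20)
partial = [model.predict(X.assign(a=value)).mean() for value in grid]
print(pd.DataFrame({"feature_values": grid, "partial_dependence": partial}).head())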
Example #20
    def _compute_features(self, component_list, X, y=None, fit=False):
        """Transforms the data by applying the given components.

        Arguments:
            component_list (list): The list of component names to compute.
            X (ww.DataTable, pd.DataFrame): Input data to the pipeline to transform.
            y (ww.DataColumn, pd.Series): The target training data of length [n_samples]
            fit (bool): Whether to fit the components as well as transform the data.
                        Defaults to False.

        Returns:
            dict: Outputs from each component
        """
        X = _convert_to_woodwork_structure(X)
        if len(component_list) == 0:
            return X
        output_cache = {}
        for component_name in component_list:
            component_instance = self.get_component(component_name)
            if not isinstance(component_instance, ComponentBase):
                raise ValueError(
                    'All components must be instantiated before fitting or predicting'
                )
            x_inputs = []
            y_input = None
            for parent_input in self.get_parents(component_name):
                if parent_input[-2:] == '.y':
                    if y_input is not None:
                        raise ValueError(
                            f'Cannot have multiple `y` parents for a single component {component_name}'
                        )
                    y_input = output_cache[parent_input]
                else:
                    parent_x = output_cache.get(
                        parent_input, output_cache.get(f'{parent_input}.x'))
                    if isinstance(parent_x, ww.DataTable):
                        parent_x = _convert_woodwork_types_wrapper(
                            parent_x.to_dataframe())
                    elif isinstance(parent_x, ww.DataColumn):
                        parent_x = pd.Series(_convert_woodwork_types_wrapper(
                            parent_x.to_series()),
                                             name=parent_input)
                    x_inputs.append(parent_x)
            input_x, input_y = self._consolidate_inputs(
                x_inputs, y_input, X, y)
            col_intersection = set(X.columns.keys()).intersection(
                set(input_x.columns.keys()))
            for col in col_intersection:
                if (
                        X[col].logical_type != input_x[col].logical_type
                        and "numeric" not in X[col].semantic_tags
                ):  # numeric is special because we may not be able to safely convert (ex: input is int, output is float)
                    try:
                        input_x = input_x.set_types({col: X[col].logical_type})
                    except TypeError:
                        # if there is a column whose type has been converted s.t. it cannot be converted back, keep as is.
                        # example: StandardScaler could convert a boolean column to a float column. This is expected, and we should not
                        # try to convert back to boolean.
                        continue
            self.input_feature_names.update(
                {component_name: list(input_x.columns)})

            if isinstance(component_instance, Transformer):
                if fit:
                    output = component_instance.fit_transform(input_x, input_y)
                else:
                    output = component_instance.transform(input_x, input_y)
                if isinstance(output, tuple):
                    output_x, output_y = output[0], output[1]
                else:
                    output_x = output
                    output_y = None
                output_cache[f"{component_name}.x"] = output_x
                output_cache[f"{component_name}.y"] = output_y
            else:
                if fit:
                    component_instance.fit(input_x, input_y)
                if not (
                        fit and component_name == self.compute_order[-1]
                ):  # Don't call predict on the final component during fit
                    output = component_instance.predict(input_x)
                else:
                    output = None
                output_cache[component_name] = output
        return output_cache