예제 #1
0
def graph_prediction_vs_actual(y_true, y_pred, outlier_threshold=None):
    """Generate a scatter plot comparing the true and predicted values. Used for regression plotting.

    Arguments:
        y_true (ww.DataColumn, pd.Series): The real target values of the data
        y_pred (ww.DataColumn, pd.Series): The predicted values outputted by the regression model.
        outlier_threshold (int, float): A positive threshold for what is considered an outlier value. It is forwarded to
                                 get_prediction_vs_actual_data, which produces the per-point 'outlier' color column used here.
                                 Points whose color is "#0000ff" are labelled "< outlier_threshold", all others ">= outlier_threshold".
                                 Defaults to None, in which case every point is labelled "Values".

    Returns:
        plotly.Figure representing the predicted vs. actual values graph

    Raises:
        ValueError: If outlier_threshold is provided and is not strictly positive (this includes 0).
    """
    # Validate the argument first so that bad input fails fast, before any optional
    # dependency is imported. Compare against None explicitly: a plain truthiness test
    # would silently let a threshold of 0 through even though it is not positive.
    if outlier_threshold is not None and outlier_threshold <= 0:
        raise ValueError(
            f"Threshold must be positive! Provided threshold is {outlier_threshold}"
        )

    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    df = get_prediction_vs_actual_data(y_true, y_pred, outlier_threshold)

    # Use one common range for both axes so the y = x reference line spans the whole plot.
    x_axis = _calculate_axis_range(df['prediction'])
    y_axis = _calculate_axis_range(df['actual'])
    x_y_line = [min(x_axis[0], y_axis[0]), max(x_axis[1], y_axis[1])]
    data = [
        _go.Scatter(x=x_y_line,
                    y=x_y_line,
                    name="y = x line",
                    line_color='grey')
    ]

    title = 'Predicted vs Actual Values Scatter Plot'
    layout = _go.Layout(title={'text': title},
                        xaxis={
                            'title': 'Prediction',
                            'range': x_y_line
                        },
                        yaxis={
                            'title': 'Actual',
                            'range': x_y_line
                        })

    has_threshold = outlier_threshold is not None
    # The 'outlier' column holds the marker color of each point, so grouping by it
    # yields one trace per color.
    for color, outlier_group in df.groupby('outlier'):
        if has_threshold:
            name = "< outlier_threshold" if color == "#0000ff" else ">= outlier_threshold"
        else:
            name = "Values"
        data.append(
            _go.Scatter(x=outlier_group['prediction'],
                        y=outlier_group['actual'],
                        mode='markers',
                        marker=_go.scatter.Marker(color=color),
                        name=name))
    return _go.Figure(layout=layout, data=data)
예제 #2
0
def graph_permutation_importance(pipeline,
                                 X,
                                 y,
                                 objective,
                                 importance_threshold=0):
    """Generate a bar graph of the pipeline's permutation importance.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame): The input data used to score and compute permutation importance
        y (ww.DataColumn, pd.Series): The target data
        objective (str, ObjectiveBase): Objective to score on
        importance_threshold (float, optional): If provided, graph features with a permutation importance whose absolute value is
            greater than or equal to importance_threshold. Defaults to zero.

    Returns:
        plotly.Figure, a bar graph showing features and their respective permutation importance.

    Raises:
        ValueError: If importance_threshold is negative.
    """
    # Reject a bad threshold up front so we do not pay for the permutation
    # importance computation only to throw the result away.
    if importance_threshold < 0:
        raise ValueError(
            f'Provided importance threshold of {importance_threshold} must be greater than or equal to 0'
        )

    go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    perm_importance = calculate_permutation_importance(pipeline, X, y,
                                                       objective)

    # Keep only features whose absolute importance reaches the threshold
    perm_importance = perm_importance[
        abs(perm_importance['importance']) >= importance_threshold]
    # Reverse the row order before plotting.
    # NOTE(review): presumably the helper returns rows sorted by descending importance,
    # so reversing puts the largest bar at the top of the horizontal chart - confirm.
    perm_importance = perm_importance.iloc[::-1]

    title = "Permutation Importance"
    subtitle = "The relative importance of each input feature's "\
               "overall influence on the pipelines' predictions, computed using "\
               "the permutation importance algorithm."
    data = [
        go.Bar(x=perm_importance['importance'],
               y=perm_importance['feature'],
               orientation='h')
    ]

    layout = {
        'title': '{0}<br><sub>{1}</sub>'.format(title, subtitle),
        'height': 800,
        'xaxis_title': 'Permutation Importance',
        'yaxis_title': 'Feature',
        # Treat feature names as categories so numeric-looking names are not placed on a numeric axis
        'yaxis': {
            'type': 'category'
        }
    }

    return go.Figure(data=data, layout=layout)
예제 #3
0
    def graph_feature_importance(self, importance_threshold=0):
        """Generate a bar graph of the pipeline's feature importance

        Arguments:
            importance_threshold (float, optional): If provided, graph features with a feature importance whose absolute value is
                greater than or equal to importance_threshold. Defaults to zero.

        Returns:
            plotly.Figure, a bar graph showing features and their corresponding importance

        Raises:
            ValueError: If importance_threshold is negative.
        """
        # Validate the argument before doing any other work so bad input fails fast.
        if importance_threshold < 0:
            raise ValueError(
                f'Provided importance threshold of {importance_threshold} must be greater than or equal to 0'
            )

        go = import_or_raise(
            "plotly.graph_objects",
            error_msg="Cannot find dependency plotly.graph_objects")
        if jupyter_check():
            import_or_raise("ipywidgets", warning=True)

        # Work on a copy: taking the absolute value in place would otherwise modify
        # whatever frame self.feature_importance hands back, which may be shared.
        feat_imp = self.feature_importance.copy()
        feat_imp['importance'] = abs(feat_imp['importance'])

        # Remove features with importance whose absolute value is less than importance threshold
        feat_imp = feat_imp[feat_imp['importance'] >= importance_threshold]

        # Reverse the row order before plotting.
        # NOTE(review): presumably feature_importance is sorted by descending importance,
        # so reversing puts the largest bar at the top of the horizontal chart - confirm.
        feat_imp = feat_imp.iloc[::-1]

        title = 'Feature Importance'
        subtitle = 'May display fewer features due to feature selection'
        data = [
            go.Bar(x=feat_imp['importance'],
                   y=feat_imp['feature'],
                   orientation='h')
        ]

        layout = {
            'title': '{0}<br><sub>{1}</sub>'.format(title, subtitle),
            'height': 800,
            'xaxis_title': 'Feature Importance',
            'yaxis_title': 'Feature',
            # Treat feature names as categories so numeric-looking names are not placed on a numeric axis
            'yaxis': {
                'type': 'category'
            }
        }

        return go.Figure(data=data, layout=layout)
예제 #4
0
    def __init__(self, data, show_plot=True):
        """Set up the empty "score by iteration" figure widget and trigger a first update.

        Arguments:
            data: Object stored as ``self.data``; its ``objective.name`` is used in the plot title.
            show_plot (bool): Accepted for interface compatibility; not read in this constructor body.
        """
        self._go = import_or_raise(
            "plotly.graph_objects",
            error_msg="Cannot find dependency plotly.graph_objects")

        if jupyter_check():
            import_or_raise("ipywidgets", warning=True)

        self.data = data
        self.best_score_by_iter_fig = None
        self.curr_iteration_scores = []
        self.best_iteration_scores = []

        objective_name = self.data.objective.name
        figure_title = 'Pipeline Search: Iteration vs. {}<br><sub>Gray marker indicates the score at current iteration</sub>'.format(
            objective_name)

        # Both traces start empty; they are expected to be filled in by update().
        best_score_trace = self._go.Scatter(x=[],
                                            y=[],
                                            mode='lines+markers',
                                            name='Best Score')
        iteration_score_trace = self._go.Scatter(x=[],
                                                 y=[],
                                                 mode='markers',
                                                 name='Iter score',
                                                 marker={'color': 'gray'})
        figure_layout = {
            'title': figure_title,
            'xaxis': {'title': 'Iteration', 'rangemode': 'tozero'},
            'yaxis': {'title': 'Score'},
        }

        self.best_score_by_iter_fig = self._go.FigureWidget(
            [best_score_trace, iteration_score_trace], figure_layout)
        self.best_score_by_iter_fig.update_layout(showlegend=False)
        self.update()
예제 #5
0
def graph_binary_objective_vs_threshold(pipeline, X, y, objective, steps=100):
    """Plot objective score against decision threshold for a fitted binary classification pipeline.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame): The input data used to score and compute scores
        y (ww.DataColumn, pd.Series): The target labels
        objective (ObjectiveBase obj, str): Objective used to score, shown on the y-axis of the graph
        steps (int): Number of intervals to divide and calculate objective score at

    Returns:
        plotly.Figure representing the objective score vs. threshold graph generated

    """
    plotly_go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    # Resolve the objective to an instance so that its display name is available.
    objective = get_objective(objective, return_instance=True)
    scores_df = binary_objective_vs_threshold(pipeline, X, y, objective, steps)

    figure_title = f'{objective.name} Scores vs. Thresholds'
    threshold_axis = {
        'title': 'Threshold',
        'range': _calculate_axis_range(scores_df['threshold']),
    }
    score_axis = {
        'title': f"{objective.name} Scores vs. Binary Classification Decision Threshold",
        'range': _calculate_axis_range(scores_df['score']),
    }
    figure_layout = plotly_go.Layout(title={'text': figure_title},
                                     xaxis=threshold_axis,
                                     yaxis=score_axis)

    score_curve = plotly_go.Scatter(x=scores_df['threshold'],
                                    y=scores_df['score'],
                                    line={'width': 3})
    return plotly_go.Figure(layout=figure_layout, data=[score_curve])
예제 #6
0
def graph_precision_recall_curve(y_true, y_pred_proba, title_addition=None):
    """Build a precision-recall plot.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True binary labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label.
        title_addition (str or None): If not None, append to plot title. Default None.

    Returns:
        plotly.Figure representing the precision-recall plot generated
    """
    plotly_go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    curve = precision_recall_curve(y_true, y_pred_proba)

    if title_addition is None:
        title_suffix = ''
    else:
        title_suffix = ' ' + title_addition
    figure_title = 'Precision-Recall{}'.format(title_suffix)

    # Both axes are rates in [0, 1]; pad the range slightly so the curve's end points stay visible.
    padded_unit_range = [-0.05, 1.05]
    figure_layout = plotly_go.Layout(
        title={'text': figure_title},
        xaxis={'title': 'Recall', 'range': list(padded_unit_range)},
        yaxis={'title': 'Precision', 'range': list(padded_unit_range)})

    pr_trace = plotly_go.Scatter(
        x=curve['recall'],
        y=curve['precision'],
        name='Precision-Recall (AUC {:06f})'.format(curve['auc_score']),
        line={'width': 3})
    return plotly_go.Figure(layout=figure_layout, data=[pr_trace])
예제 #7
0
def graph_confusion_matrix(y_true,
                           y_pred,
                           normalize_method='true',
                           title_addition=None):
    """Build an annotated confusion matrix heatmap.

    When `normalize_method` is set, the cells show normalized values and the hover text additionally shows the raw count.
    When it is None, the cells show raw counts and the hover text additionally shows the count normalized with method 'true'.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True binary labels.
        y_pred (ww.DataColumn, pd.Series or np.ndarray): Predictions from a binary classifier.
        normalize_method ({'true', 'pred', 'all', None}): Normalization method to use, if not None. Supported options are: 'true' to normalize by row, 'pred' to normalize by column, or 'all' to normalize by all values. Defaults to 'true'.
        title_addition (str or None): if not None, append to plot title. Defaults to None.

    Returns:
        plotly.Figure representing the confusion matrix plot generated
    """
    plotly_go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    plotly_ff = import_or_raise(
        "plotly.figure_factory",
        error_msg="Cannot find dependency plotly.figure_factory")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    # Always compute both variants: one is displayed, the other goes into the hover text.
    raw_matrix = confusion_matrix(y_true, y_pred, normalize_method=None)
    normalized_matrix = confusion_matrix(
        y_true, y_pred, normalize_method=normalize_method or 'true')
    labels = raw_matrix.columns.tolist()

    show_raw = normalize_method is None

    addition_part = '' if title_addition is None else (' ' + title_addition)
    if show_raw:
        method_part = ''
    else:
        method_part = ', normalized using method "' + normalize_method + '"'
    title = 'Confusion matrix{}{}'.format(addition_part, method_part)

    if show_raw:
        displayed, custom_data = raw_matrix, normalized_matrix
        primary_heading, secondary_heading = 'Raw', 'Normalized'
    else:
        displayed, custom_data = normalized_matrix, raw_matrix
        primary_heading, secondary_heading = 'Normalized', 'Raw'

    z_data = displayed.to_numpy()
    z_text = [["{:.3f}".format(cell) for cell in row] for row in z_data]

    hover_text = ''.join([
        '<br><b>', primary_heading, ' Count</b>: %{z}<br><b>',
        secondary_heading, ' Count</b>: %{customdata} <br>'
    ])
    # The trailing "<extra></extra>" tag removes the unwanted trace info box from the hover label
    hover_template = ''.join([
        '<b>True</b>: %{y}<br><b>Predicted</b>: %{x}', hover_text,
        '<extra></extra>'
    ])

    layout = plotly_go.Layout(
        title={'text': title},
        xaxis={'title': 'Predicted Label', 'type': 'category', 'tickvals': labels},
        yaxis={'title': 'True Label', 'type': 'category', 'tickvals': labels})

    fig = plotly_ff.create_annotated_heatmap(z_data,
                                             x=labels,
                                             y=labels,
                                             annotation_text=z_text,
                                             customdata=custom_data,
                                             hovertemplate=hover_template,
                                             colorscale='Blues',
                                             showscale=True)
    fig.update_layout(layout)
    # Move the x axis labels to the bottom so they do not overlap with the title
    fig['layout']['xaxis'].update(side='bottom')
    # plotly Heatmap y axis defaults to the reverse of what we want: https://community.plotly.com/t/heatmap-y-axis-is-reversed-by-default-going-against-standard-convention-for-matrices/32180
    fig.update_yaxes(autorange="reversed")
    return fig
예제 #8
0
def graph_partial_dependence(pipeline,
                             X,
                             feature,
                             class_label=None,
                             grid_resolution=100):
    """Create an one-way partial dependence plot.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input data used to generate a grid of values
            for feature where partial dependence will be calculated at
        feature (int, string): The target feature for which to create the partial dependence plot for.
            If feature is an int, it must be the index of the feature to use.
            If feature is a string, it must be a valid column name in X.
        class_label (string, optional): Name of class to plot for multiclass problems. If None, will plot
            the partial dependence for each class. This argument does not change behavior for regression or binary
            classification pipelines. For binary classification, the partial dependence for the positive label will
            always be displayed. Defaults to None.
        grid_resolution (int): Passed through unchanged to partial_dependence. Defaults to 100.

    Returns:
        plotly.Figure: For multiclass pipelines, a figure with one subplot per plotted class;
            otherwise a figure with a single partial dependence trace.

    Raises:
        ValueError: If pipeline is a multiclass pipeline and class_label is given but is not one of pipeline.classes_.

    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    is_multiclass = isinstance(
        pipeline, evalml.pipelines.MulticlassClassificationPipeline)

    if is_multiclass and class_label is not None:
        if class_label not in pipeline.classes_:
            # Stringify each class first: str.join raises TypeError on non-string labels
            # (e.g. integer classes), which would mask the intended ValueError.
            known_classes = ', '.join(str(c) for c in pipeline.classes_)
            msg = f"Class {class_label} is not one of the classes the pipeline was fit on: {known_classes}"
            raise ValueError(msg)

    part_dep = partial_dependence(pipeline,
                                  X,
                                  feature=feature,
                                  grid_resolution=grid_resolution)
    feature_name = str(feature)
    title = f"Partial Dependence of '{feature_name}'"
    layout = _go.Layout(title={'text': title},
                        xaxis={'title': f'{feature_name}'},
                        yaxis={'title': 'Partial Dependence'},
                        showlegend=False)
    if is_multiclass:
        class_labels = [class_label
                        ] if class_label is not None else pipeline.classes_
        _subplots = import_or_raise(
            "plotly.subplots",
            error_msg="Cannot find dependency plotly.subplots")

        # If the user passes in a value for class_label, we want to create a 1 x 1 subplot or else there would
        # be an empty column in the plot and it would look awkward
        rows, cols = ((len(class_labels) + 1) // 2,
                      2) if len(class_labels) > 1 else (1, len(class_labels))

        # Don't specify share_xaxis and share_yaxis so that we get tickmarks in each subplot
        fig = _subplots.make_subplots(rows=rows,
                                      cols=cols,
                                      subplot_titles=class_labels)
        for i, label in enumerate(class_labels):
            label_rows = part_dep.class_label == label
            # Subplots are filled left to right, two per row; plotly subplot rows and
            # columns are 1-based, hence the +1 on both coordinates.
            grid_row, grid_col = divmod(i, 2)
            fig.add_trace(_go.Scatter(
                x=part_dep.loc[label_rows, 'feature_values'],
                y=part_dep.loc[label_rows, 'partial_dependence'],
                line=dict(width=3),
                name=label),
                          row=grid_row + 1,
                          col=grid_col + 1)
        fig.update_layout(layout)
        # Give every subplot the same axis ranges so the classes can be compared visually
        fig.update_xaxes(title=f'{feature_name}',
                         range=_calculate_axis_range(
                             part_dep['feature_values']))
        fig.update_yaxes(
            range=_calculate_axis_range(part_dep['partial_dependence']))
    else:
        trace = _go.Scatter(x=part_dep['feature_values'],
                            y=part_dep['partial_dependence'],
                            name='Partial Dependence',
                            line=dict(width=3))
        fig = _go.Figure(layout=layout, data=[trace])

    return fig
예제 #9
0
def graph_roc_curve(y_true,
                    y_pred_proba,
                    custom_class_names=None,
                    title_addition=None):
    """Generate and display a Receiver Operating Characteristic (ROC) plot for binary and multiclass classification problems.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied. Note this should be a one dimensional array with the predicted probability for the "true" label in the binary case.
        custom_class_names (list or None): If not None, custom labels for classes; must contain one name per ROC curve. Default None.
        title_addition (str or None): if not None, append to plot title. Default None.

    Returns:
        plotly.Figure representing the ROC plot generated

    Raises:
        ValueError: If custom_class_names is given and its length differs from the number of curves returned by roc_curve.
    """
    _go = import_or_raise(
        "plotly.graph_objects",
        error_msg="Cannot find dependency plotly.graph_objects")
    if jupyter_check():
        import_or_raise("ipywidgets", warning=True)

    title = 'Receiver Operating Characteristic{}'.format(
        '' if title_addition is None else (' ' + title_addition))
    layout = _go.Layout(title={'text': title},
                        xaxis={
                            'title': 'False Positive Rate',
                            'range': [-0.05, 1.05]
                        },
                        yaxis={
                            'title': 'True Positive Rate',
                            'range': [-0.05, 1.05]
                        })

    all_curve_data = roc_curve(y_true, y_pred_proba)
    graph_data = []

    n_classes = len(all_curve_data)

    # Compare against None explicitly: with a truthiness test an empty list would skip
    # this check and then fail below with an unhelpful IndexError.
    if custom_class_names is not None and len(custom_class_names) != n_classes:
        raise ValueError(
            'Number of custom class names does not match number of classes')

    for i in range(n_classes):
        roc_curve_data = all_curve_data[i]
        # Without custom names, classes are labelled 1..n_classes
        name = i + 1 if custom_class_names is None else custom_class_names[i]
        graph_data.append(
            _go.Scatter(
                x=roc_curve_data['fpr_rates'],
                y=roc_curve_data['tpr_rates'],
                hovertemplate=
                "(False Positive Rate: %{x}, True Positive Rate: %{y})<br>" +
                "Threshold: %{text}",
                name=f"Class {name} (AUC {roc_curve_data['auc_score']:.06f})",
                text=roc_curve_data["thresholds"],
                line=dict(width=3)))
    # Diagonal reference line for comparison against the curves above
    graph_data.append(
        _go.Scatter(x=[0, 1],
                    y=[0, 1],
                    name='Trivial Model (AUC 0.5)',
                    line=dict(dash='dash')))
    return _go.Figure(layout=layout, data=graph_data)