Example #1
def log_ks_statistic(y_true,
                     y_pred,
                     experiment=None,
                     channel_name='metric_charts',
                     prefix=''):
    """Creates and logs KS statistics curve and KS statistics score to Neptune.

    Kolmogorov-Smirnov statistics chart can be calculated for true positive rates (TPR) and true negative rates (TNR)
    for each threshold and plotted on a chart.
    The maximum distance from TPR to TNR can be treated as performance metric.

    Args:
        y_true (array-like, shape (n_samples)): Ground truth (correct) target values.
        y_pred (array-like, shape (n_samples, 2)): Predictions for classes 0 and 1 with values from 0 to 1.
        experiment (`neptune.experiments.Experiment`): Neptune experiment. Default is None.
        channel_name (str): Name of the Neptune channel. Default is 'metric_charts'.
        prefix (str): Prefix that will be added before the metric name when logged to Neptune.

    Examples:
        Train the model and make predictions on test::

            from sklearn.datasets import make_classification
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.model_selection import train_test_split
            from sklearn.metrics import classification_report

            X, y = make_classification(n_samples=2000)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

            model = RandomForestClassifier()
            model.fit(X_train, y_train)

            y_test_pred = model.predict_proba(X_test)

        Create and log KS statistics curve and KS statistics score to Neptune::

            import neptune
            from neptunecontrib.monitoring.metrics import log_ks_statistic

            neptune.init()
            with neptune.create_experiment():
                log_ks_statistic(y_test, y_test_pred)

        Check out this experiment https://ui.neptune.ai/o/neptune-ai/org/binary-classification-metrics/e/BIN-101/logs.

    """
    assert len(y_pred.shape) == 2, \
        'y_pred needs to be (n_samples, 2), use expand_prediction helper to format it'

    _exp = experiment if experiment else neptune

    res = binary_ks_curve(y_true, y_pred[:, 1])
    ks_stat = res[3]  # binary_ks_curve returns the KS statistic as its fourth element
    _exp.log_metric(prefix + 'ks_statistic', ks_stat)

    fig, ax = plt.subplots()
    plt_metrics.plot_ks_statistic(y_true, y_pred, ax=ax)
    send_figure(fig, channel_name=prefix + channel_name, experiment=_exp)
    plt.close()
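
A quick cross-check (a sketch, not part of the snippet above): the KS score logged here equals the two-sample Kolmogorov-Smirnov statistic between the positive-class scores of the two true classes, so it can be verified with SciPy. The y_test / y_test_pred names are taken from the docstring example.

import numpy as np
from scipy.stats import ks_2samp

y_test = np.asarray(y_test)
pos_scores = y_test_pred[y_test == 1, 1]  # class-1 scores of the true positives
neg_scores = y_test_pred[y_test == 0, 1]  # class-1 scores of the true negatives
ks_stat, p_value = ks_2samp(pos_scores, neg_scores)
print('KS statistic: {:.3f} (p = {:.3g})'.format(ks_stat, p_value))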
Example #2
def plot_ks_statistic(y_true, y_probas, title='KS Statistic Plot', ax=None):
    """Generates the KS Statistic plot for a set of ground truth labels and classifier probability predictions.

    Args:
        y_true (array-like, shape (n_samples)):
            Ground truth (correct) target values.

        y_probas (array-like, shape (n_samples, n_classes)):
            Prediction probabilities for each class returned by a classifier.

        title (string, optional): Title of the generated plot. Defaults to "KS Statistic Plot".

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to plot
            the learning curve. If None, the plot is drawn on a new set of axes.

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> lr = LogisticRegression()
        >>> lr = lr.fit(X_train, y_train)
        >>> y_probas = lr.predict_proba(X_test)
        >>> skplt.plot_ks_statistic(y_test, y_probas)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_ks_statistic.png
           :align: center
           :alt: KS Statistic
    """
    classes = np.unique(y_true)
    if len(classes) != 2:
        raise ValueError('Cannot calculate KS statistic for data with '
                         '{} category/ies'.format(len(classes)))
    probas = y_probas

    # Compute KS Statistic curves
    thresholds, pct1, pct2, ks_statistic, \
        max_distance_at, classes = binary_ks_curve(y_true, probas[:, 1].ravel())

    if ax is None:
        fig, ax = plt.subplots(1, 1)

    ax.set_title(title)

    ax.plot(thresholds, pct1, lw=3, label='Class {}'.format(classes[0]))
    ax.plot(thresholds, pct2, lw=3, label='Class {}'.format(classes[1]))
    idx = np.where(thresholds == max_distance_at)[0][0]
    ax.axvline(max_distance_at,
               *sorted([pct1[idx], pct2[idx]]),
               label='KS Statistic: {:.3f} at {:.3f}'.format(
                   ks_statistic, max_distance_at),
               linestyle=':',
               lw=3,
               color='black')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel('Threshold')
    ax.set_ylabel('Percentage below threshold')
    ax.legend(loc='lower right')

    return ax
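
For reference, a self-contained version of the docstring example above (a sketch; the dataset, split ratio, and random seeds are illustrative assumptions, and scikitplot.plotters is the deprecated module this snippet comes from):

import matplotlib.pyplot as plt
import scikitplot.plotters as skplt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

lr = LogisticRegression().fit(X_train, y_train)
y_probas = lr.predict_proba(X_test)  # shape (n_samples, 2), as required
skplt.plot_ks_statistic(y_test, y_probas)
plt.show()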
Example #3
def ks_abc(y_true, y_pred, ax=None, figsize=None, colors=('darkorange', 'b'), title=None, xlim=(0.,1.), ylim=(0.,1.),
           fmt='.2f', lw=2, legend='best', plot=True, filename=None):
    """
    Perform the Kolmogorov–Smirnov test over the positive and negative distributions of a binary classifier, and
    compute the area between the curves.
    The KS plot shows the fraction of positives and negatives falling below each threshold. The optimal threshold is
    the one at which the two curves are farthest apart, i.e. the one enabling the best class separation.
    The area between the curves gives further insight into the separation: the higher the area (1 being the maximum),
    the closer the positive and negative distributions' centers of mass are to 1 and 0, respectively.

    Based on scikit-plot's `plot_ks_statistic` method.

    Parameters:
    -----------
    y_true : array-like
        The true labels of the dataset
    y_pred : array-like
        The probabilities predicted by a binary classifier
    ax : matplotlib ax, default = None
        Matplotlib Axis on which the curves will be plotted
    figsize : (int,int) or None, default = None
        a Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
        default. Only used if `ax=None`.
    colors : list of Matplotlib color strings, default = ('darkorange', 'b')
        List of colors to be used for the plotted curves.
    title : string or None, default = None
        Plotted graph title. If None, default title is used
    xlim : (float, float), default = (0.,1.)
        X-axis limits.
    ylim : (float,float), default = (0.,1.)
        Y-axis limits.
    fmt : string, default = '.2f'
        String formatting of displayed numbers.
    lw : int, default = 2
        Line-width.
    legend : string or None, default = 'best'
        Position of the graph legend. If None, no legend is displayed.
    plot : Boolean, default = True
        Whether to display the graph.
    filename : string or None, default = None
        If not None, plot will be saved to the given file name

    Returns:
    --------
    A dictionary of the following keys:
    'abc': area between curves,
    'ks_stat': computed statistic of the KS test,
    'eopt': estimated optimal threshold,
    'ax': the ax used to plot the curves
    """
    y_true = convert(y_true, 'array')
    y_pred = convert(y_pred, 'array')
    if y_pred.shape != y_true.shape:
        raise ValueError('y_true and y_pred must have the same shape')
    elif len(y_pred.shape) == 1 or y_pred.shape[1] == 1:
        y_t = y_true
        y_p = y_pred
    elif y_pred.shape[1] == 2:
        y_t = [np.argmax(x) for x in y_true]
        y_p = [x[1] for x in y_pred]
    else:
        raise ValueError('y_true and y_pred must originate from a binary classifier, but have {} columns'.format(y_pred.shape[1]))

    thresholds, nr, pr, ks_statistic, max_distance_at, _ = binary_ks_curve(y_t, y_p)
    if ax is None:
        plt.figure(figsize=figsize)
        ax = plt.gca()

    ax.plot(thresholds, pr, lw=lw, color=colors[0], label='Positive Class')
    ax.plot(thresholds, nr, lw=lw, color=colors[1], label='Negative Class')
    idx = np.where(thresholds == max_distance_at)[0][0]
    ax.axvline(max_distance_at, *sorted([nr[idx], pr[idx]]),
               label='KS Statistic: {ks:{fmt}} at {d:{fmt}}'.format(ks=ks_statistic, d=max_distance_at, fmt=fmt),
               linestyle=':', lw=lw, color='grey')

    # Rectangle-rule accumulation of the area between the negative and positive
    # curves; 1.001 is appended to give the last threshold a right-hand edge.
    thresholds = np.append(thresholds, 1.001)
    abc = 0.
    for i in range(len(pr)):
        abc += (nr[i] - pr[i]) * (thresholds[i + 1] - thresholds[i])

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Percentage below threshold')
    ax.set_title('{t} [ABC = {a:{fmt}}]'.format(t=title or 'KS Statistic Plot', a=abc, fmt=fmt))
    if legend:
        ax.legend(loc=legend)
    if filename:
        plt.savefig(filename)
    if plot:
        plt.show()
    return {'abc': abc,
            'ks_stat': ks_statistic,
            'eopt': max_distance_at,
            'ax': ax}
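
The accumulation loop above is a rectangle-rule integral of (nr - pr) over the threshold grid. Assuming nr, pr, and thresholds as returned by binary_ks_curve, it is equivalent to this vectorized NumPy form (a sketch, not from the original):

import numpy as np

# Append the same 1.001 right-hand edge, then integrate with the rectangle rule.
edges = np.append(thresholds, 1.001)
abc = np.sum((nr - pr) * np.diff(edges))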
Example #4
def plot_ks_statistic(y_true, y_probas, title='KS Statistic Plot',
                      ax=None, figsize=None, title_fontsize="large",
                      text_fontsize="medium"):
    """Generates the KS Statistic plot from labels and scores/probabilities

    Args:
        y_true (array-like, shape (n_samples)):
            Ground truth (correct) target values.

        y_probas (array-like, shape (n_samples, n_classes)):
            Prediction probabilities for each class returned by a classifier.

        title (string, optional): Title of the generated plot. Defaults to
            "KS Statistic Plot".

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the learning curve. If None, the plot is drawn on a new set of
            axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot
            e.g. (6, 6). Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to
            "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to
            "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Example:
        >>> import scikitplot as skplt
        >>> lr = LogisticRegression()
        >>> lr = lr.fit(X_train, y_train)
        >>> y_probas = lr.predict_proba(X_test)
        >>> skplt.metrics.plot_ks_statistic(y_test, y_probas)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_ks_statistic.png
           :align: center
           :alt: KS Statistic
    """
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    classes = np.unique(y_true)
    if len(classes) != 2:
        raise ValueError('Cannot calculate KS statistic for data with '
                         '{} category/ies'.format(len(classes)))
    probas = y_probas

    # Compute KS Statistic curves
    thresholds, pct1, pct2, ks_statistic, \
        max_distance_at, classes = binary_ks_curve(y_true,
                                                   probas[:, 1].ravel())

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)

    ax.plot(thresholds, pct1, lw=3, label='Class {}'.format(classes[0]))
    ax.plot(thresholds, pct2, lw=3, label='Class {}'.format(classes[1]))
    idx = np.where(thresholds == max_distance_at)[0][0]
    ax.axvline(max_distance_at, *sorted([pct1[idx], pct2[idx]]),
               label='KS Statistic: {:.3f} at {:.3f}'.format(ks_statistic,
                                                             max_distance_at),
               linestyle=':', lw=3, color='black')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel('Threshold', fontsize=text_fontsize)
    ax.set_ylabel('Percentage below threshold', fontsize=text_fontsize)
    ax.tick_params(labelsize=text_fontsize)
    ax.legend(loc='lower right', fontsize=text_fontsize)

    return ax
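
All of these snippets delegate the curve computation to scikit-plot's binary_ks_curve helper. Here is a simplified sketch of its semantics (the real helper handles edge cases differently, and the function name here is ours): for each distinct score it computes the fraction of each true class whose positive-class score falls at or below that threshold, and the KS statistic is the largest vertical gap between the two empirical CDFs.

import numpy as np

def binary_ks_curve_sketch(y_true, y_score):
    # Empirical CDFs of the positive-class scores, one per true class,
    # evaluated at every distinct score; KS statistic = max vertical gap.
    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
    classes = np.unique(y_true)
    thresholds = np.unique(y_score)
    pct1 = np.array([(y_score[y_true == classes[0]] <= t).mean() for t in thresholds])
    pct2 = np.array([(y_score[y_true == classes[1]] <= t).mean() for t in thresholds])
    idx = np.argmax(np.abs(pct1 - pct2))
    ks_statistic = np.abs(pct1 - pct2)[idx]
    return thresholds, pct1, pct2, ks_statistic, thresholds[idx], classes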
Example #5
def plot_ks_statistic(clf,
                      X,
                      y,
                      title='KS Statistic Plot',
                      do_split=True,
                      test_split_ratio=0.33,
                      random_state=None,
                      ax=None):
    """Generates the KS Statistic plot for a given classifier and dataset.

    Args:
        clf: Classifier instance that implements "fit" and "predict_proba" methods.

        X (array-like, shape (n_samples, n_features)):
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y (array-like, shape (n_samples) or (n_samples, n_features)):
            Target relative to X for classification.

        title (string, optional): Title of the generated plot. Defaults to "KS Statistic Plot".

        do_split (bool, optional): If True, the dataset is split into training and testing sets.
            The classifier is trained on the training set and the KS curves are plotted using the
            performance of the classifier on the testing set. If False, the KS curves are generated
            without splitting the dataset or training the classifier. This assumes that the
            classifier has already been called with its `fit` method beforehand.

        test_split_ratio (float, optional): Used when do_split is set to True. Determines the
            proportion of the entire dataset to use in the testing split. Default is set to 0.33.

        random_state (int or :class:`RandomState`, optional): Pseudo-random number generator
            state used for random sampling.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to plot
            the learning curve. If None, the plot is drawn on a new set of axes.

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was drawn.

    Example:
        >>> lr = classifier_factory(LogisticRegression())
        >>> lr.plot_ks_statistic(X, y, random_state=1)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_ks_statistic.png
           :align: center
           :alt: KS Statistic
    """
    if not hasattr(clf, 'predict_proba'):
        raise TypeError(
            '"predict_proba" method not in classifier. Cannot calculate KS statistic.'
        )

    if not do_split:
        if len(clf.classes_) != 2:
            raise ValueError('Cannot calculate KS statistic for data with '
                             '{} category/ies'.format(len(clf.classes_)))
        probas = clf.predict_proba(X)
        y_true = y

    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_split_ratio,
            stratify=y,
            random_state=random_state)
        clf_clone = clone(clf)
        clf_clone.fit(X_train, y_train)
        if len(clf_clone.classes_) != 2:
            raise ValueError('Cannot calculate KS statistic for data with '
                             '{} category/ies'.format(len(clf_clone.classes_)))
        probas = clf_clone.predict_proba(X_test)
        y_true = y_test

    # Compute KS Statistic curves
    thresholds, pct1, pct2, ks_statistic, \
        max_distance_at, classes = binary_ks_curve(y_true, probas[:, 1].ravel())

    if ax is None:
        fig, ax = plt.subplots(1, 1)

    ax.set_title(title)

    ax.plot(thresholds, pct1, lw=3, label='Class {}'.format(classes[0]))
    ax.plot(thresholds, pct2, lw=3, label='Class {}'.format(classes[1]))
    idx = np.where(thresholds == max_distance_at)[0][0]
    ax.axvline(max_distance_at,
               *sorted([pct1[idx], pct2[idx]]),
               label='KS Statistic: {:.3f} at {:.3f}'.format(
                   ks_statistic, max_distance_at),
               linestyle=':',
               lw=3,
               color='black')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel('Threshold')
    ax.set_ylabel('Percentage below threshold')
    ax.legend(loc='lower right')

    return ax
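
To make the docstring example self-contained, here is a sketch assuming the deprecated classifier_factory entry point from early scikit-plot releases, which attached this method to the estimator instance; the dataset is an illustrative assumption:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from scikitplot import classifier_factory  # deprecated factory-based API

X, y = make_classification(n_samples=1000, random_state=1)
lr = classifier_factory(LogisticRegression())
lr.plot_ks_statistic(X, y, random_state=1)  # splits, fits a clone, plots on the test split
plt.show()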