Example #1
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import unique_labels
# Private helper; its module path has moved across scikit-learn versions
# (sklearn.metrics._classification in 0.22+).
from sklearn.metrics._classification import _check_targets


def confusion_matrix_2017_new(y_true, y_pred, labels=None, sample_weight=None):
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if y_type not in ("binary", "multiclass"):
        raise ValueError("%s is not supported" % y_type)

    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)
        if np.all([l not in y_true for l in labels]):
            raise ValueError("At least one label specified must be in y_true")

    if sample_weight is None:
        sample_weight = np.ones(y_true.shape[0], dtype=np.int64)
    else:
        sample_weight = np.asarray(sample_weight)

    check_consistent_length(sample_weight, y_true, y_pred)

    n_labels = labels.size
    # If labels are not consecutive integers starting from zero, then
    # yt, yp must be converted into index form
    need_index_conversion = not (
        labels.dtype.kind in {'i', 'u', 'b'} and
        labels.min() == 0 and np.all(np.diff(labels) == 1) and
        y_true.min() >= 0 and y_pred.min() >= 0
    )
    if need_index_conversion:
        label_to_ind = dict((y, x) for x, y in enumerate(labels))
        y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred])
        y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true])

    # eliminate items in y_true, y_pred not in labels
    isvalid = np.logical_and(y_pred < n_labels, y_true < n_labels)
    if not np.all(isvalid):
        y_pred = y_pred[isvalid]
        y_true = y_true[isvalid]
        # also eliminate weights of eliminated items
        sample_weight = sample_weight[isvalid]

    # Choose the accumulator dtype to always have high precision
    if sample_weight.dtype.kind in {'i', 'u', 'b'}:
        dtype = np.int64
    else:
        dtype = np.float64

    CM = coo_matrix((sample_weight, (y_true, y_pred)),
                    shape=(n_labels, n_labels), dtype=dtype,
                    ).toarray()

    return CM
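A quick sanity check of the function above (a minimal sketch; output follows the usual convention of rows = true labels, columns = predictions):

y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 2, 0]
print(confusion_matrix_2017_new(y_true, y_pred))
# expected:
# [[1 0 0]
#  [1 0 1]
#  [0 0 2]]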
Example #2

from scipy.sparse import coo_matrix


def quick_cm(y_true, y_pred, labels, sample_weight):
    # Minimal core of the computation above: scatter sample weights into an
    # (n_labels, n_labels) grid indexed by (true, predicted) label pairs.
    # Assumes y_true and y_pred are already integer indices into labels.
    n_labels = len(labels)
    C = coo_matrix(
        (sample_weight, (y_true, y_pred)), shape=(n_labels, n_labels)
    ).toarray()
    return C
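quick_cm skips all of the validation above, so it only makes sense when y_true and y_pred are already integer indices into labels; a minimal sketch:

import numpy as np

y_true = np.array([0, 1, 2, 2, 1])
y_pred = np.array([0, 2, 2, 2, 0])
weights = np.ones_like(y_true)
print(quick_cm(y_true, y_pred, labels=[0, 1, 2], sample_weight=weights))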
Example #3
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import unique_labels
# Private helper; lives in sklearn.metrics._classification in 0.22+.
from sklearn.metrics._classification import _check_targets


def confusion_matrix_2021(y_true, y_pred, *, labels=None, sample_weight=None,
                          normalize=None):
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if y_type not in ("binary", "multiclass"):
        raise ValueError("%s is not supported" % y_type)

    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)
        n_labels = labels.size
        if n_labels == 0:
            raise ValueError("'labels' should contains at least one label.")
        elif y_true.size == 0:
            return np.zeros((n_labels, n_labels), dtype=int)
        elif np.all([l not in y_true for l in labels]):
            raise ValueError("At least one label specified must be in y_true")

    if sample_weight is None:
        sample_weight = np.ones(y_true.shape[0], dtype=np.int64)
    else:
        sample_weight = np.asarray(sample_weight)

    check_consistent_length(y_true, y_pred, sample_weight)

    if normalize not in ['true', 'pred', 'all', None]:
        raise ValueError("normalize must be one of {'true', 'pred', "
                         "'all', None}")

    n_labels = labels.size
    label_to_ind = {y: x for x, y in enumerate(labels)}
    # convert yt, yp into index
    y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred])
    y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true])

    # intersect y_pred, y_true with labels, eliminate items not in labels
    ind = np.logical_and(y_pred < n_labels, y_true < n_labels)
    y_pred = y_pred[ind]
    y_true = y_true[ind]
    # also eliminate weights of eliminated items
    sample_weight = sample_weight[ind]

    # Choose the accumulator dtype to always have high precision
    if sample_weight.dtype.kind in {'i', 'u', 'b'}:
        dtype = np.int64
    else:
        dtype = np.float64

    cm = coo_matrix((sample_weight, (y_true, y_pred)),
                    shape=(n_labels, n_labels), dtype=dtype,
                    ).toarray()

    with np.errstate(all='ignore'):
        if normalize == 'true':
            cm = cm / cm.sum(axis=1, keepdims=True)
        elif normalize == 'pred':
            cm = cm / cm.sum(axis=0, keepdims=True)
        elif normalize == 'all':
            cm = cm / cm.sum()
        cm = np.nan_to_num(cm)

    return cm
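A short sketch of the normalize option added in this version; with normalize='true' each row (true class) sums to one:

y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
print(confusion_matrix_2021(y_true, y_pred, normalize='true'))
# expected, approximately:
# [[0.5   0.5  ]
#  [0.333 0.667]]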
Example #4
def mwe_check_sample_weight():
    import ubelt as ub

    from scipy.sparse import coo_matrix
    from sklearn.utils.validation import _check_sample_weight
    import numpy as np
    results = []
    # np.int was removed in NumPy 1.24; use the builtin int for the cast.
    ns = np.logspace(1, 6, 100).astype(int)
    for n in ub.ProgIter(ns, desc='time-tradeoff', verbose=3):
        print('n = {!r}'.format(n))

        y_true = np.random.randint(0, 100, n).astype(np.int64)
        y_pred = np.random.randint(0, 100, n).astype(np.int64)
        sample_weight = np.random.rand(n)

        import timerit
        ti = timerit.Timerit(9, bestof=3, verbose=2)
        for timer in ti.reset('old-sample-weight-given'):
            with timer:
                np.asarray(sample_weight)
        results.append({
            'n': n,
            'label': ti.label,
            'time': ti.mean(),
        })

        for timer in ti.reset('new-sample-weight-given'):
            with timer:
                _check_sample_weight(sample_weight, y_true, dtype=np.int64)
        results.append({
            'n': n,
            'label': ti.label,
            'time': ti.mean(),
        })

        for timer in ti.reset('old-sample-weight-default'):
            with timer:
                np.ones(y_true.shape[0], dtype=np.int64)
        results.append({
            'n': n,
            'label': ti.label,
            'time': ti.mean(),
        })

        for timer in ti.reset('new-sample-weight-default'):
            with timer:
                _check_sample_weight(None, y_true, dtype=np.int64)
        results.append({
            'n': n,
            'label': ti.label,
            'time': ti.mean(),
        })

    import pandas as pd
    df = pd.DataFrame(results)

    import kwplot
    import seaborn as sns
    kwplot.autoplt()
    sns.set()
    ax = sns.lineplot(data=df, x='n', y='time', hue='label')
    ax.set_yscale('log')
    ax.set_xscale('log')

    # Second benchmark: compare coo_matrix accumulation for the old integer
    # default sample_weight versus the new float64 default.
    results = []
    ns = np.logspace(1, 6, 100).astype(int)
    for n in ub.ProgIter(ns, desc='time-tradeoff', verbose=3):
        print('n = {!r}'.format(n))

        n_labels = 100
        y_true = np.random.randint(0, n_labels, n).astype(np.int64)
        y_pred = np.random.randint(0, n_labels, n).astype(np.int64)

        sample_weight = np.ones(y_true.shape[0], dtype=np.int64)

        for timer in ti.reset('use-old-uint8-sample-weight-default'):
            with timer:
                if sample_weight.dtype.kind in {'i', 'u', 'b'}:
                    dtype = np.int64
                else:
                    dtype = np.float64
                cm = coo_matrix((sample_weight, (y_true, y_pred)),
                                shape=(n_labels, n_labels), dtype=dtype,
                                ).toarray()
        results.append({
            'n': n,
            'label': ti.label,
            'time': ti.mean(),
        })

        sample_weight = _check_sample_weight(None, y_true, dtype=np.int64)
        for timer in ti.reset('use-new-float64-sample-weight-default'):
            with timer:
                if sample_weight.dtype.kind in {'i', 'u', 'b'}:
                    dtype = np.int64
                else:
                    dtype = np.float64
                cm = coo_matrix((sample_weight, (y_true, y_pred)),
                                shape=(n_labels, n_labels), dtype=dtype,
                                ).toarray()
        results.append({
            'n': n,
            'label': ti.label,
            'time': ti.mean(),
        })
    import pandas as pd
    df = pd.DataFrame(results)

    import kwplot
    import seaborn as sns
    kwplot.autoplt()
    sns.set()
    ax = sns.lineplot(data=df, x='n', y='time', hue='label')
    ax.set_yscale('log')
    ax.set_xscale('log')
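For a rough check without the ubelt/timerit/kwplot dependencies, the same two default-sample_weight code paths can be compared with only the standard library; a minimal sketch (timings are illustrative, not authoritative):

import timeit
import numpy as np
from sklearn.utils.validation import _check_sample_weight

y_true = np.random.randint(0, 100, 10000).astype(np.int64)
# Old default: allocate an array of ones directly.
t_old = timeit.timeit(lambda: np.ones(y_true.shape[0], dtype=np.int64), number=1000)
# New default: go through the shared validation helper.
t_new = timeit.timeit(lambda: _check_sample_weight(None, y_true, dtype=np.int64), number=1000)
print('old: {:.4f}s  new: {:.4f}s'.format(t_old, t_new))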