def confusion_matrix_2017_new(y_true, y_pred, labels=None, sample_weight=None):
    """
    Build a confusion matrix, skipping label re-indexing when it is not needed.

    Entry ``[i, j]`` of the result accumulates the weight of every sample
    whose true label maps to row ``i`` and whose predicted label maps to
    column ``j``, where rows and columns follow the order of ``labels``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted targets; validated by ``_check_targets``.
    labels : array-like, optional
        Labels that index the matrix. When omitted they are taken from
        ``unique_labels(y_true, y_pred)``.
    sample_weight : array-like, optional
        Per-sample weights. Defaults to an int64 weight of one per sample.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_labels, n_labels)``; int64 when the weights have
        an integer, unsigned or boolean dtype, float64 otherwise.

    Raises
    ------
    ValueError
        If the target type is neither "binary" nor "multiclass", or if
        ``labels`` is given and none of its entries occurs in ``y_true``.
    """
    target_kind, y_true, y_pred = _check_targets(y_true, y_pred)
    if target_kind not in ("binary", "multiclass"):
        raise ValueError("%s is not supported" % target_kind)

    if labels is not None:
        labels = np.asarray(labels)
        if np.all([lab not in y_true for lab in labels]):
            raise ValueError("At least one label specified must be in y_true")
    else:
        labels = unique_labels(y_true, y_pred)

    sample_weight = (
        np.ones(y_true.shape[0], dtype=np.int64)
        if sample_weight is None
        else np.asarray(sample_weight)
    )
    check_consistent_length(sample_weight, y_true, y_pred)

    n_labels = labels.size

    # When the labels are exactly 0, 1, ..., n_labels - 1 and no target is
    # negative, the targets can be used directly as row/column indices.
    # The checks are ordered so that later ones only run if earlier ones pass.
    targets_are_indices = (
        labels.dtype.kind in {'i', 'u', 'b'}
        and labels.min() == 0
        and np.all(np.diff(labels) == 1)
        and y_true.min() >= 0
        and y_pred.min() >= 0
    )
    if not targets_are_indices:
        # Values missing from `labels` get an out-of-range index so the
        # mask below removes them.
        out_of_range = n_labels + 1
        index_of = {label: position for position, label in enumerate(labels)}
        y_pred = np.array([index_of.get(value, out_of_range) for value in y_pred])
        y_true = np.array([index_of.get(value, out_of_range) for value in y_true])

    # Drop samples whose true or predicted index falls outside the matrix,
    # together with their weights.
    keep = np.logical_and(y_pred < n_labels, y_true < n_labels)
    if not np.all(keep):
        y_pred, y_true = y_pred[keep], y_true[keep]
        sample_weight = sample_weight[keep]

    # Accumulate in a wide dtype so the sums keep full precision.
    integral_weights = sample_weight.dtype.kind in {'i', 'u', 'b'}
    accumulator_dtype = np.int64 if integral_weights else np.float64

    sparse_counts = coo_matrix(
        (sample_weight, (y_true, y_pred)),
        shape=(n_labels, n_labels),
        dtype=accumulator_dtype,
    )
    return sparse_counts.toarray()
def quick_cm(y_true, y_pred, labels, sample_weight):
    """
    Accumulate a dense confusion matrix with no input validation.

    ``y_true`` and ``y_pred`` are used directly as row and column indices,
    so they must already be integer indices smaller than ``len(labels)``.
    Weights of samples that land on the same cell are summed.

    Parameters
    ----------
    y_true, y_pred : sequence of int
        Row and column index of each sample.
    labels : sized
        Only its length is used; it fixes the matrix to
        ``(len(labels), len(labels))``.
    sample_weight : sequence of numbers
        Weight added to the cell of each sample.

    Returns
    -------
    numpy.ndarray
        The dense ``(len(labels), len(labels))`` matrix.
    """
    side = len(labels)
    cell_coords = (y_true, y_pred)
    sparse_counts = coo_matrix((sample_weight, cell_coords), shape=(side, side))
    return sparse_counts.toarray()
def confusion_matrix_2021(y_true, y_pred, *, labels=None, sample_weight=None,
                          normalize=None):
    """
    Build a confusion matrix, optionally normalized (2021-era variant).

    Targets are always re-indexed through a label-to-position mapping.
    Entry ``[i, j]`` accumulates the weight of the samples whose true label
    is ``labels[i]`` and whose predicted label is ``labels[j]``; samples
    whose true or predicted value is not in ``labels`` are dropped.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted targets; validated by ``_check_targets``.
    labels : array-like, optional
        Labels that index the matrix. When omitted they are taken from
        ``unique_labels(y_true, y_pred)``.
    sample_weight : array-like, optional
        Per-sample weights. Defaults to an int64 weight of one per sample.
    normalize : {'true', 'pred', 'all'}, optional
        Divide by the row sums ('true'), the column sums ('pred') or the
        grand total ('all'). ``None`` leaves the accumulated values as is.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_labels, n_labels)``. NaNs produced by dividing
        by an empty row/column are replaced via ``np.nan_to_num``.

    Raises
    ------
    ValueError
        If the target type is neither "binary" nor "multiclass", if
        ``labels`` is given but empty or shares no entry with a non-empty
        ``y_true``, or if ``normalize`` is not an accepted value.
    """
    target_kind, y_true, y_pred = _check_targets(y_true, y_pred)
    if target_kind not in ("binary", "multiclass"):
        raise ValueError("%s is not supported" % target_kind)

    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)
        given_count = labels.size
        if given_count == 0:
            raise ValueError("'labels' should contains at least one label.")
        if y_true.size == 0:
            # No samples at all: the matrix is all zeros.
            return np.zeros((given_count, given_count), dtype=int)
        if np.all([lab not in y_true for lab in labels]):
            raise ValueError("At least one label specified must be in y_true")

    sample_weight = (
        np.ones(y_true.shape[0], dtype=np.int64)
        if sample_weight is None
        else np.asarray(sample_weight)
    )
    check_consistent_length(y_true, y_pred, sample_weight)

    if normalize not in ('true', 'pred', 'all', None):
        raise ValueError("normalize must be one of {'true', 'pred', "
                         "'all', None}")

    n_labels = labels.size

    # Map every target to the position of its label; values missing from
    # `labels` get an out-of-range index so the mask below removes them.
    out_of_range = n_labels + 1
    index_of = {label: position for position, label in enumerate(labels)}
    y_pred = np.array([index_of.get(value, out_of_range) for value in y_pred])
    y_true = np.array([index_of.get(value, out_of_range) for value in y_true])

    # Keep only samples whose true and predicted labels are both known,
    # and drop the weights of the removed samples as well.
    keep = np.logical_and(y_pred < n_labels, y_true < n_labels)
    y_pred, y_true = y_pred[keep], y_true[keep]
    sample_weight = sample_weight[keep]

    # Accumulate in a wide dtype so the sums keep full precision.
    integral_weights = sample_weight.dtype.kind in {'i', 'u', 'b'}
    accumulator_dtype = np.int64 if integral_weights else np.float64

    cm = coo_matrix(
        (sample_weight, (y_true, y_pred)),
        shape=(n_labels, n_labels),
        dtype=accumulator_dtype,
    ).toarray()

    # Division by a zero sum is expected here; silence the warnings and
    # clean the resulting NaNs afterwards.
    with np.errstate(all='ignore'):
        if normalize == 'true':
            denominator = cm.sum(axis=1, keepdims=True)
        elif normalize == 'pred':
            denominator = cm.sum(axis=0, keepdims=True)
        elif normalize == 'all':
            denominator = cm.sum()
        else:
            denominator = None
        if denominator is not None:
            cm = cm / denominator
        cm = np.nan_to_num(cm)

    return cm
def mwe_check_sample_weight():
    """
    Time ``_check_sample_weight`` against the older inline weight handling.

    Two experiments run over ``np.logspace(1, 6, 100)`` sample counts, each
    timed with ``timerit.Timerit(9, bestof=3)`` and then drawn as a log-log
    seaborn line plot of mean time against ``n`` (one line per label):

    1. Preparing the weights: ``np.asarray`` / ``np.ones`` (old) versus
       ``_check_sample_weight`` (new), for given and for default weights.
    2. Accumulating a 100x100 ``coo_matrix`` from the default weights
       produced by each of the two approaches.

    Requires ubelt, timerit, pandas, seaborn, kwplot, scipy and
    scikit-learn. Returns nothing; the timings are only printed and plotted.
    """
    import numpy as np
    import timerit
    import ubelt as ub
    # Imported locally like the other dependencies so the example is
    # self-contained.
    from scipy.sparse import coo_matrix
    from sklearn.utils.validation import _check_sample_weight

    def record(results, n, ti):
        """Append the mean time of the measurement `ti` just finished."""
        results.append({
            'n': n,
            'label': ti.label,
            'time': ti.mean(),
        })

    def plot(results):
        """Draw one log-log line per label from the collected timings."""
        import pandas as pd
        import kwplot
        import seaborn as sns
        df = pd.DataFrame(results)
        kwplot.autoplt()
        sns.set()
        ax = sns.lineplot(data=df, x='n', y='time', hue='label')
        ax.set_yscale('log')
        ax.set_xscale('log')

    # Builtin `int` replaces the `np.int` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24 (it was only ever an alias of `int`).
    ns = np.logspace(1, 6, 100).astype(int)

    # One timer serves every measurement; `reset` clears it and sets the
    # label before each use.
    ti = timerit.Timerit(9, bestof=3, verbose=2)

    # --- Experiment 1: cost of preparing the sample weights ---
    results = []
    for n in ub.ProgIter(ns, desc='time-tradeoff', verbose=3):
        print('n = {!r}'.format(n))
        y_true = np.random.randint(0, 100, n).astype(np.int64)
        sample_weight = np.random.rand(n)

        # The timed statements stay inline: wrapping them in callables
        # would add call overhead comparable to the work being measured.
        for timer in ti.reset('old-sample-weight-given'):
            with timer:
                np.asarray(sample_weight)
        record(results, n, ti)

        for timer in ti.reset('new-sample-weight-given'):
            with timer:
                _check_sample_weight(sample_weight, y_true, dtype=np.int64)
        record(results, n, ti)

        for timer in ti.reset('old-sample-weight-default'):
            with timer:
                np.ones(y_true.shape[0], dtype=np.int64)
        record(results, n, ti)

        for timer in ti.reset('new-sample-weight-default'):
            with timer:
                _check_sample_weight(None, y_true, dtype=np.int64)
        record(results, n, ti)

    plot(results)

    # --- Experiment 2: cost of accumulating the matrix from each default ---
    n_labels = 100
    results = []
    for n in ub.ProgIter(ns, desc='time-tradeoff', verbose=3):
        print('n = {!r}'.format(n))
        y_true = np.random.randint(0, n_labels, n).astype(np.int64)
        y_pred = np.random.randint(0, n_labels, n).astype(np.int64)

        # NOTE: the "old" weights are int64 even though the label says
        # uint8; the labels are kept as they were.
        default_weights = [
            ('use-old-uint8-sample-weight-default',
             np.ones(y_true.shape[0], dtype=np.int64)),
            ('use-new-float64-sample-weight-default',
             _check_sample_weight(None, y_true, dtype=np.int64)),
        ]
        for label, sample_weight in default_weights:
            for timer in ti.reset(label):
                with timer:
                    # The dtype choice is part of the timed workload.
                    if sample_weight.dtype.kind in {'i', 'u', 'b'}:
                        dtype = np.int64
                    else:
                        dtype = np.float64
                    # Result is discarded; only the elapsed time matters.
                    cm = coo_matrix((sample_weight, (y_true, y_pred)),
                                    shape=(n_labels, n_labels), dtype=dtype,
                                    ).toarray()
            record(results, n, ti)

    plot(results)