def test_label_binarizer_unseen_labels():
    """Labels not seen during fit are encoded as all-zero rows by transform."""
    binarizer = LabelBinarizer()

    # Fitting on three distinct labels gives one indicator column per label.
    fitted_output = binarizer.fit_transform(['b', 'd', 'e'])
    assert_array_equal(
        np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]),
        fitted_output,
    )

    # 'a', 'c' and 'f' were not part of the fitted labels: their rows are
    # expected to contain only zeros.
    transformed = binarizer.transform(['a', 'b', 'c', 'd', 'e', 'f'])
    assert_array_equal(
        np.array(
            [
                [0, 0, 0],
                [1, 0, 0],
                [0, 0, 0],
                [0, 1, 0],
                [0, 0, 1],
                [0, 0, 0],
            ]
        ),
        transformed,
    )
def test_label_binarizer_unseen_labels():
    """Check the encoding of fitted labels and of labels unseen at fit time."""
    encoder = LabelBinarizer()

    # Each pair is (label, expected indicator row after fitting on b/d/e).
    fit_cases = [
        ("b", [1, 0, 0]),
        ("d", [0, 1, 0]),
        ("e", [0, 0, 1]),
    ]
    fit_labels = [label for label, _ in fit_cases]
    fit_expected = np.array([row for _, row in fit_cases])
    assert_array_equal(fit_expected, encoder.fit_transform(fit_labels))

    # Labels outside the fitted set ("a", "c", "f") map to all-zero rows.
    transform_cases = [
        ("a", [0, 0, 0]),
        ("b", [1, 0, 0]),
        ("c", [0, 0, 0]),
        ("d", [0, 1, 0]),
        ("e", [0, 0, 1]),
        ("f", [0, 0, 0]),
    ]
    query_labels = [label for label, _ in transform_cases]
    transform_expected = np.array([row for _, row in transform_cases])
    assert_array_equal(transform_expected, encoder.transform(query_labels))
def test_label_binarizer_set_label_encoding():
    """Custom neg_label/pos_label values appear in the output and round-trip."""
    # two-class case with pos_label=0
    binary_labels = np.array([0, 1, 1, 0])
    binary_lb = LabelBinarizer(neg_label=-2, pos_label=0)
    binary_encoded = binary_lb.fit_transform(binary_labels)
    assert_array_equal(np.array([[-2, 0, 0, -2]]).T, binary_encoded)
    assert_array_equal(binary_lb.inverse_transform(binary_encoded), binary_labels)

    # multi-class case
    multiclass_labels = np.array([3, 2, 1, 2, 0])
    multiclass_lb = LabelBinarizer(neg_label=-2, pos_label=2)
    multiclass_expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    multiclass_encoded = multiclass_lb.fit_transform(multiclass_labels)
    assert_array_equal(multiclass_expected, multiclass_encoded)
    assert_array_equal(
        multiclass_lb.inverse_transform(multiclass_encoded), multiclass_labels
    )
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Assert that binarizing ``y`` yields ``expected`` and can be inverted.

    Runs once with ``sparse_output=True`` and once with ``False``. For each
    setting it checks ``label_binarize``, the matching private inverse helper,
    and a ``LabelBinarizer`` fit/inverse round trip. When sparse output is
    requested with ``pos_label == 0`` or ``neg_label != 0``, the only check is
    that ``label_binarize`` raises ``ValueError``.
    """
    for sparse_output in (True, False):
        binarize_kwargs = dict(
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )

        sparse_unsupported = pos_label == 0 or neg_label != 0
        if sparse_unsupported and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(y, **binarize_kwargs)
            continue

        # check label_binarize
        encoded = label_binarize(y, **binarize_kwargs)
        assert_array_equal(toarray(encoded), expected)
        assert issparse(encoded) == sparse_output

        # check inverse
        target_kind = type_of_target(y)
        if target_kind != "multiclass":
            # Threshold halfway between the two label values.
            midpoint = (neg_label + pos_label) / 2.0
            decoded = _inverse_binarize_thresholding(
                encoded,
                output_type=target_kind,
                classes=classes,
                threshold=midpoint,
            )
        else:
            decoded = _inverse_binarize_multiclass(encoded, classes=classes)
        assert_array_equal(toarray(decoded), toarray(y))

        # Check label binarizer
        estimator = LabelBinarizer(
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        encoded = estimator.fit_transform(y)
        assert_array_equal(toarray(encoded), expected)
        assert issparse(encoded) == sparse_output

        round_tripped = estimator.inverse_transform(encoded)
        assert_array_equal(toarray(round_tripped), toarray(y))
        assert issparse(round_tripped) == issparse(y)
def test_label_binarizer():
    """Exercise LabelBinarizer on one-class, two-class and multi-class input."""
    # one-class case defaults to negative label
    single_class = ["pos", "pos", "pos", "pos"]
    all_negative = np.array([[0, 0, 0, 0]]).T

    # For dense case:
    one_class_lb = LabelBinarizer(sparse_output=False)
    dense_out = one_class_lb.fit_transform(single_class)
    assert_array_equal(one_class_lb.classes_, ["pos"])
    assert_array_equal(all_negative, dense_out)
    assert_array_equal(one_class_lb.inverse_transform(dense_out), single_class)

    # For sparse case:
    sparse_lb = LabelBinarizer(sparse_output=True)
    sparse_out = sparse_lb.fit_transform(single_class)
    assert issparse(sparse_out)
    assert_array_equal(sparse_lb.classes_, ["pos"])
    assert_array_equal(all_negative, sparse_out.toarray())
    assert_array_equal(
        sparse_lb.inverse_transform(sparse_out.toarray()), single_class
    )

    dense_lb = LabelBinarizer(sparse_output=False)

    # two-class case
    two_class = ["neg", "pos", "pos", "neg"]
    two_class_out = dense_lb.fit_transform(two_class)
    assert_array_equal(dense_lb.classes_, ["neg", "pos"])
    assert_array_equal(np.array([[0, 1, 1, 0]]).T, two_class_out)

    # A two-column indicator matrix is also accepted by inverse_transform.
    two_column_indicator = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(dense_lb.inverse_transform(two_column_indicator), two_class)

    # multi-class case
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    multi_class_expected = np.array(
        [
            [0, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [1, 0, 0, 0],
        ]
    )
    multi_class_out = dense_lb.fit_transform(multi_class)
    assert_array_equal(dense_lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(multi_class_expected, multi_class_out)
    assert_array_equal(dense_lb.inverse_transform(multi_class_out), multi_class)
def test_label_binarizer_errors():
    """Check that invalid arguments and inputs raise ``ValueError``.

    Covers legacy multi-label input, use before fitting, inconsistent
    ``neg_label`` / ``pos_label`` / ``sparse_output`` settings, bad arguments
    to ``_inverse_binarize_thresholding`` and multioutput targets.
    """
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0,), (0, 2)]
    with pytest.raises(ValueError):
        lb.transform(multi_label)

    # An unfitted binarizer must refuse to transform in either direction.
    lb = LabelBinarizer()
    with pytest.raises(ValueError):
        lb.transform([])

    with pytest.raises(ValueError):
        lb.inverse_transform([])

    # Build *and* fit inside the context manager: the parameter check may run
    # in the constructor or be deferred to ``fit`` depending on the
    # scikit-learn release, and the error is expected in either case.
    input_labels = [0, 1, 0, 1]
    invalid_params = [
        dict(neg_label=2, pos_label=1),
        dict(neg_label=2, pos_label=2),
        dict(neg_label=1, pos_label=2, sparse_output=True),
    ]
    for params in invalid_params:
        with pytest.raises(ValueError):
            LabelBinarizer(**params).fit(input_labels)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))

    # ``classes`` is passed by keyword so the call reaches the multioutput
    # check instead of failing on the call signature.
    with pytest.raises(ValueError):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
def test_label_binarizer_errors():
    """Invalid inputs and parameters must raise ``ValueError`` with a known message."""
    # Check that invalid arguments yield ValueError
    fitted_lb = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    legacy_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=legacy_msg):
        fitted_lb.transform([(2, 3), (0,), (0, 2)])

    # Neither direction of the transform works before fitting.
    unfitted_lb = LabelBinarizer()
    not_fitted_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted_lb.transform([])
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted_lb.inverse_transform([])

    # Inconsistent constructor parameters are reported when fit is called.
    input_labels = [0, 1, 0, 1]
    bad_configurations = [
        (
            dict(neg_label=2, pos_label=1),
            "neg_label=2 must be strictly less than pos_label=1.",
        ),
        (
            dict(neg_label=2, pos_label=2),
            "neg_label=2 must be strictly less than pos_label=2.",
        ),
        (
            dict(neg_label=1, pos_label=2, sparse_output=True),
            "Sparse binarization is only supported with non zero pos_label and zero "
            "neg_label, got pos_label=2 and neg_label=1",
        ),
    ]
    for params, message in bad_configurations:
        misconfigured = LabelBinarizer(**params)
        with pytest.raises(ValueError, match=message):
            misconfigured.fit(input_labels)

    # Fail on y_type
    unsupported_format_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=unsupported_format_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    seq_of_seqs_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=seq_of_seqs_msg):
        LabelBinarizer().fit_transform([[], [1, 2], [3], [0, 1, 3], [2]])

    # Fail on the number of classes
    class_count_msg = (
        "The number of class is not equal to the number of dimension of y."
    )
    with pytest.raises(ValueError, match=class_count_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    binary_shape_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=binary_shape_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    multioutput_msg = "Multioutput target data is not supported with label binarization"
    multioutput_y = np.array([[1, 3], [2, 1]])
    with pytest.raises(ValueError, match=multioutput_msg):
        LabelBinarizer().fit(multioutput_y)
    with pytest.raises(ValueError, match=multioutput_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
def class_report(y_true, y_pred, y_score=None, average='micro'):
    """Build a per-label classification report as a DataFrame.

    The result has one row per distinct label found in ``y_pred`` plus a
    final ``'avg / total'`` row. Columns are ``precision``, ``recall``,
    ``f1-score``, ``support``, ``pred`` (number of predictions per label) and,
    when ``y_score`` is given, ``AUC``.

    Parameters
    ----------
    y_true, y_pred : array-like exposing ``.shape``
        True and predicted labels. Their shapes must be identical.
    y_score : 2-D array, optional
        Scores indexed as ``y_score[:, k]`` for the k-th sorted distinct
        label of ``y_pred``. When omitted, no ``AUC`` column is added.
    average : str, default 'micro'
        ``'micro'`` or ``'macro'``: how the ROC AUC of the ``'avg / total'``
        row is aggregated. Any other value computes no summary AUC.

    Returns
    -------
    pandas.DataFrame or None
        The report, or ``None`` (after printing a message) when the shapes
        of ``y_true`` and ``y_pred`` differ.
    """
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" % (y_true.shape, y_pred.shape))
        return None

    # The binarizer is only fitted for 1-D targets; it is used further down
    # to compute the micro-averaged ROC curve.
    lb = LabelBinarizer()
    if len(y_true.shape) == 1:
        lb.fit(y_true)

    # Value counts of predictions
    # NOTE(review): labels come from y_pred only, so a class that is never
    # predicted gets no row and shifts which y_score column each label reads.
    # Confirm callers guarantee every class is predicted at least once.
    labels, cnt = np.unique(y_pred, return_counts=True)
    n_classes = len(labels)
    pred_cnt = pd.Series(cnt, index=labels)

    metrics_summary = precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, labels=labels)
    avg = list(precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary), index=metrics_sum_index, columns=labels)

    total = class_report_df.loc['support'].sum()
    # The last element of the averaged result is replaced by the summed
    # support so the summary row carries the total.
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred'] = pred_cnt
    # Write the summary cell through a single indexer on the frame itself.
    # Chained ``df['pred'].iloc[-1] = ...`` assigns into a temporary and is
    # not guaranteed to reach the frame.
    class_report_df.iloc[-1, class_report_df.columns.get_loc('pred')] = total

    if y_score is not None:
        fpr = {}
        tpr = {}
        roc_auc = {}

        # One-vs-rest ROC curve and AUC for every predicted label.
        for label_it, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int), y_score[:, label_it])
            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            binarized_true = lb.transform(y_true).ravel()
            # With at most two predicted labels only column 1 of y_score is
            # used (presumably the positive-class score; confirm with callers).
            micro_scores = y_score[:, 1] if n_classes <= 2 else y_score
            fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                binarized_true, micro_scores.ravel())
            roc_auc["avg / total"] = auc(fpr["avg / total"], tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[i] for i in labels]))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr
            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df