示例#1
0
def test_label_binarizer_unseen_labels():
    lb = LabelBinarizer()

    expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    got = lb.fit_transform(['b', 'd', 'e'])
    assert_array_equal(expected, got)

    expected = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1],
                         [0, 0, 0]])
    got = lb.transform(['a', 'b', 'c', 'd', 'e', 'f'])
    assert_array_equal(expected, got)
示例#2
0
def test_label_binarizer_unseen_labels():
    lb = LabelBinarizer()

    expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    got = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(expected, got)

    expected = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1],
                         [0, 0, 0]])
    got = lb.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(expected, got)
示例#3
0
def test_label_binarizer_set_label_encoding():
    lb = LabelBinarizer(neg_label=-2, pos_label=0)

    # two-class case with pos_label=0
    inp = np.array([0, 1, 1, 0])
    expected = np.array([[-2, 0, 0, -2]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    lb = LabelBinarizer(neg_label=-2, pos_label=2)

    # multi-class case
    inp = np.array([3, 2, 1, 2, 0])
    expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
示例#4
0
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(
                    y,
                    classes=classes,
                    neg_label=neg_label,
                    pos_label=pos_label,
                    sparse_output=sparse_output,
                )
            continue

        # check label_binarize
        binarized = label_binarize(
            y,
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
示例#5
0
def test_label_binarizer():
    # one-class case defaults to negative label
    # For dense case:
    inp = ["pos", "pos", "pos", "pos"]
    lb = LabelBinarizer(sparse_output=False)
    expected = np.array([[0, 0, 0, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)

    # For sparse case:
    lb = LabelBinarizer(sparse_output=True)
    got = lb.fit_transform(inp)
    assert issparse(got)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(expected, got.toarray())
    assert_array_equal(lb.inverse_transform(got.toarray()), inp)

    lb = LabelBinarizer(sparse_output=False)
    # two-class case
    inp = ["neg", "pos", "pos", "neg"]
    expected = np.array([[0, 1, 1, 0]]).T
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(expected, got)

    to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), inp)

    # multi-class case
    inp = ["spam", "ham", "eggs", "ham", "0"]
    expected = np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0],
                         [0, 0, 1, 0], [1, 0, 0, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(expected, got)
    assert_array_equal(lb.inverse_transform(got), inp)
示例#6
0
def test_label_binarizer_errors():
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0, ), (0, 2)]
    with pytest.raises(ValueError):
        lb.transform(multi_label)

    lb = LabelBinarizer()
    with pytest.raises(ValueError):
        lb.transform([])
    with pytest.raises(ValueError):
        lb.inverse_transform([])

    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=2)

    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo",
                                       classes=[1, 2],
                                       threshold=0)

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo",
                                       classes=[1, 2, 3],
                                       threshold=0)

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
                                       output_type="binary",
                                       classes=[1, 2, 3],
                                       threshold=0)

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError):
        label_binarize(np.array([[1, 3], [2, 1]]), [1, 2, 3])
示例#7
0
def test_label_binarizer_errors():
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0, ), (0, 2)]
    err_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=err_msg):
        lb.transform(multi_label)

    lb = LabelBinarizer()
    err_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=err_msg):
        lb.transform([])
    with pytest.raises(ValueError, match=err_msg):
        lb.inverse_transform([])

    input_labels = [0, 1, 0, 1]
    err_msg = "neg_label=2 must be strictly less than pos_label=1."
    lb = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = "neg_label=2 must be strictly less than pos_label=2."
    lb = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1")
    lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)

    # Fail on y_type
    err_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    err_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    err_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    err_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    err_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=err_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
示例#8
0
def class_report(y_true, y_pred, y_score=None, average='micro'):
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" %
              (y_true.shape, y_pred.shape))
        return

    lb = LabelBinarizer()

    if len(y_true.shape) == 1:
        lb.fit(y_true)

    #Value counts of predictions
    labels, cnt = np.unique(y_pred, return_counts=True)
    n_classes = len(labels)
    pred_cnt = pd.Series(cnt, index=labels)

    metrics_summary = precision_recall_fscore_support(y_true=y_true,
                                                      y_pred=y_pred,
                                                      labels=labels)

    avg = list(
        precision_recall_fscore_support(y_true=y_true,
                                        y_pred=y_pred,
                                        average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(list(metrics_summary),
                                   index=metrics_sum_index,
                                   columns=labels)

    support = class_report_df.loc['support']
    total = support.sum()
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred'] = pred_cnt
    class_report_df['pred'].iloc[-1] = total

    if not (y_score is None):
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for label_it, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int), y_score[:, label_it])

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(), y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(), y_score.ravel())

            roc_auc["avg / total"] = auc(fpr["avg / total"],
                                         tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[i] for i in labels]))

            # Then interpolate all ROC curves at this points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df