def test_label_binarizer_set_label_encoding(): lb = LabelBinarizer(neg_label=-2, pos_label=0) # two-class case with pos_label=0 inp = np.array([0, 1, 1, 0]) expected = np.array([[-2, 0, 0, -2]]).T got = lb.fit_transform(inp) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp) lb = LabelBinarizer(neg_label=-2, pos_label=2) # multi-class case inp = np.array([3, 2, 1, 2, 0]) expected = np.array( [ [-2, -2, -2, +2], [-2, -2, +2, -2], [-2, +2, -2, -2], [-2, -2, +2, -2], [+2, -2, -2, -2], ] ) got = lb.fit_transform(inp) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_errors(): # Check that invalid arguments yield ValueError one_class = np.array([0, 0, 0, 0]) lb = LabelBinarizer().fit(one_class) multi_label = [(2, 3), (0,), (0, 2)] with pytest.raises(ValueError): lb.transform(multi_label) lb = LabelBinarizer() with pytest.raises(ValueError): lb.transform([]) with pytest.raises(ValueError): lb.inverse_transform([]) with pytest.raises(ValueError): LabelBinarizer(neg_label=2, pos_label=1) with pytest.raises(ValueError): LabelBinarizer(neg_label=2, pos_label=2) with pytest.raises(ValueError): LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) # Fail on y_type with pytest.raises(ValueError): _inverse_binarize_thresholding( y=csr_matrix([[1, 2], [2, 1]]), output_type="foo", classes=[1, 2], threshold=0, ) # Sequence of seq type should raise ValueError y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] with pytest.raises(ValueError): LabelBinarizer().fit_transform(y_seq_of_seqs) # Fail on the number of classes with pytest.raises(ValueError): _inverse_binarize_thresholding( y=csr_matrix([[1, 2], [2, 1]]), output_type="foo", classes=[1, 2, 3], threshold=0, ) # Fail on the dimension of 'binary' with pytest.raises(ValueError): _inverse_binarize_thresholding( y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary", classes=[1, 2, 3], threshold=0, ) # Fail on multioutput data with pytest.raises(ValueError): LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) with pytest.raises(ValueError): label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
def test_label_binarizer(): # one-class case defaults to negative label # For dense case: inp = ["pos", "pos", "pos", "pos"] lb = LabelBinarizer(sparse_output=False) expected = np.array([[0, 0, 0, 0]]).T got = lb.fit_transform(inp) assert_array_equal(lb.classes_, ["pos"]) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp) # For sparse case: lb = LabelBinarizer(sparse_output=True) got = lb.fit_transform(inp) assert issparse(got) assert_array_equal(lb.classes_, ["pos"]) assert_array_equal(expected, got.toarray()) assert_array_equal(lb.inverse_transform(got.toarray()), inp) lb = LabelBinarizer(sparse_output=False) # two-class case inp = ["neg", "pos", "pos", "neg"] expected = np.array([[0, 1, 1, 0]]).T got = lb.fit_transform(inp) assert_array_equal(lb.classes_, ["neg", "pos"]) assert_array_equal(expected, got) to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]]) assert_array_equal(lb.inverse_transform(to_invert), inp) # multi-class case inp = ["spam", "ham", "eggs", "ham", "0"] expected = np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]) got = lb.fit_transform(inp) assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam']) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp)
def check_binarized_results(y, classes, pos_label, neg_label, expected): for sparse_output in [True, False]: if (pos_label == 0 or neg_label != 0) and sparse_output: with pytest.raises(ValueError): label_binarize( y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output, ) continue # check label_binarize binarized = label_binarize( y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output, ) assert_array_equal(toarray(binarized), expected) assert issparse(binarized) == sparse_output # check inverse y_type = type_of_target(y) if y_type == "multiclass": inversed = _inverse_binarize_multiclass(binarized, classes=classes) else: inversed = _inverse_binarize_thresholding( binarized, output_type=y_type, classes=classes, threshold=((neg_label + pos_label) / 2.0), ) assert_array_equal(toarray(inversed), toarray(y)) # Check label binarizer lb = LabelBinarizer( neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output ) binarized = lb.fit_transform(y) assert_array_equal(toarray(binarized), expected) assert issparse(binarized) == sparse_output inverse_output = lb.inverse_transform(binarized) assert_array_equal(toarray(inverse_output), toarray(y)) assert issparse(inverse_output) == issparse(y)
def test_label_binarizer_errors(): # Check that invalid arguments yield ValueError one_class = np.array([0, 0, 0, 0]) lb = LabelBinarizer().fit(one_class) multi_label = [(2, 3), (0, ), (0, 2)] err_msg = "You appear to be using a legacy multi-label data representation." with pytest.raises(ValueError, match=err_msg): lb.transform(multi_label) lb = LabelBinarizer() err_msg = "This LabelBinarizer instance is not fitted yet" with pytest.raises(ValueError, match=err_msg): lb.transform([]) with pytest.raises(ValueError, match=err_msg): lb.inverse_transform([]) input_labels = [0, 1, 0, 1] err_msg = "neg_label=2 must be strictly less than pos_label=1." lb = LabelBinarizer(neg_label=2, pos_label=1) with pytest.raises(ValueError, match=err_msg): lb.fit(input_labels) err_msg = "neg_label=2 must be strictly less than pos_label=2." lb = LabelBinarizer(neg_label=2, pos_label=2) with pytest.raises(ValueError, match=err_msg): lb.fit(input_labels) err_msg = ( "Sparse binarization is only supported with non zero pos_label and zero " "neg_label, got pos_label=2 and neg_label=1") lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) with pytest.raises(ValueError, match=err_msg): lb.fit(input_labels) # Fail on y_type err_msg = "foo format is not supported" with pytest.raises(ValueError, match=err_msg): _inverse_binarize_thresholding( y=csr_matrix([[1, 2], [2, 1]]), output_type="foo", classes=[1, 2], threshold=0, ) # Sequence of seq type should raise ValueError y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] err_msg = "You appear to be using a legacy multi-label data representation" with pytest.raises(ValueError, match=err_msg): LabelBinarizer().fit_transform(y_seq_of_seqs) # Fail on the number of classes err_msg = "The number of class is not equal to the number of dimension of y." with pytest.raises(ValueError, match=err_msg): _inverse_binarize_thresholding( y=csr_matrix([[1, 2], [2, 1]]), output_type="foo", classes=[1, 2, 3], threshold=0, ) # Fail on the dimension of 'binary' err_msg = "output_type='binary', but y.shape" with pytest.raises(ValueError, match=err_msg): _inverse_binarize_thresholding( y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary", classes=[1, 2, 3], threshold=0, ) # Fail on multioutput data err_msg = "Multioutput target data is not supported with label binarization" with pytest.raises(ValueError, match=err_msg): LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) with pytest.raises(ValueError, match=err_msg): label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])