def test_multilabel_binarizer_non_integer_labels():
    """Round-trip string and tuple labels through MultiLabelBinarizer.

    Each case is encoded with both ``fit_transform`` and ``fit().transform``;
    the indicator matrix, the learned classes and the result of
    ``inverse_transform`` are compared against the expected values.
    """
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])
    cases = [
        ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
        ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
        (
            [((2,), (3,)), ((1,),), ((1,), (2,))],
            _to_object_array([(1,), (2,), (3,)]),
        ),
    ]

    for raw_labels, expected_classes in cases:
        labels = np.array(raw_labels, dtype=object)

        # One-step API: fit_transform()
        binarizer = MultiLabelBinarizer()
        encoded = binarizer.fit_transform(labels)
        assert_array_equal(encoded, expected)
        assert_array_equal(binarizer.classes_, expected_classes)
        decoded = np.array(binarizer.inverse_transform(expected), dtype=object)
        assert_array_equal(decoded, labels)

        # Two-step API: fit().transform()
        binarizer = MultiLabelBinarizer()
        encoded = binarizer.fit(labels).transform(labels)
        assert_array_equal(encoded, expected)
        assert_array_equal(binarizer.classes_, expected_classes)
        decoded = np.array(binarizer.inverse_transform(expected), dtype=object)
        assert_array_equal(decoded, labels)

    # Dict labels are expected to be rejected with a TypeError.
    binarizer = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        binarizer.fit_transform([({}), ({}, {'a': 'b'})])
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample still yield a 1 x 2 indicator row."""
    samples = [(1, 1, 1, 0)]
    expected = np.array([[1, 1]])
    encoded = MultiLabelBinarizer().fit_transform(samples)
    assert_array_equal(encoded, expected)
def test_multilabel_binarizer_given_classes():
    """Exercise MultiLabelBinarizer when ``classes`` is supplied explicitly.

    Covers both fitting APIs, an extra class that never occurs in the data,
    fitting on a one-shot iterator, and rejection of duplicate classes.
    """
    class_order = [1, 3, 2]
    samples = [(2, 3), (1,), (1, 2)]
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 0, 1]])

    # fit_transform()
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit_transform(samples), expected)
    assert_array_equal(binarizer.classes_, class_order)

    # fit().transform()
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit(samples).transform(samples), expected)
    assert_array_equal(binarizer.classes_, class_order)

    # An additional, unused class contributes an all-zero leading column.
    binarizer = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    padded = np.hstack(([[0], [0], [0]], expected))
    assert_array_equal(binarizer.fit_transform(samples), padded)
    assert_array_equal(binarizer.classes_, [4, 1, 3, 2])

    # fit() must not consume the iterator: the same iterator is handed to
    # transform() afterwards and still has to produce every row.
    sample_iter = iter(samples)
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit(sample_iter).transform(sample_iter), expected)

    # Duplicate entries in ``classes`` must raise a ValueError.
    err_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        binarizer.fit(sample_iter)
def test_multilabel_binarizer_inverse_validation():
    """inverse_transform rejects non-binary values and wrongly shaped input."""
    binarizer = MultiLabelBinarizer()
    binarizer.fit_transform([(1, 1, 1, 0)])

    # Not binary: an entry other than 0/1 must raise.
    with pytest.raises(ValueError):
        binarizer.inverse_transform(np.array([[1, 3]]))

    # Every binary row of the right width is accepted without error.
    for valid_row in ([[0, 0]], [[1, 1]], [[1, 0]]):
        binarizer.inverse_transform(np.array(valid_row))

    # Wrong shape: too few and too many columns must both raise.
    for bad_row in ([[1]], [[1, 1, 1]]):
        with pytest.raises(ValueError):
            binarizer.inverse_transform(np.array(bad_row))
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels is encoded as an all-zero row."""
    samples = [[1, 2], [1], []]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 0]])
    binarizer = MultiLabelBinarizer()
    encoded = binarizer.fit_transform(samples)
    assert_array_equal(encoded, expected)
# NOTE(review): a function with this exact name is also defined earlier in
# this file; at import time this later definition replaces the earlier one.
def test_multilabel_binarizer_non_integer_labels():
    """Check MultiLabelBinarizer with string and tuple (non-integer) labels.

    For every case both ``fit_transform`` and ``fit().transform`` must give
    the same indicator matrix and classes, and ``inverse_transform`` must
    recover the original label tuples.
    """
    indicator = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    def verify(fitted, encoded, labels, classes):
        # Shared assertions for one fitted binarizer and its encoded output.
        assert_array_equal(encoded, indicator)
        assert_array_equal(fitted.classes_, classes)
        recovered = np.array(fitted.inverse_transform(indicator), dtype=object)
        assert_array_equal(recovered, labels)

    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    scenarios = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]

    for raw, classes in scenarios:
        labels = np.array(raw, dtype=object)

        # fit_transform()
        mlb = MultiLabelBinarizer()
        verify(mlb, mlb.fit_transform(labels), labels, classes)

        # fit().transform()
        mlb = MultiLabelBinarizer()
        verify(mlb, mlb.fit(labels).transform(labels), labels, classes)

    # Dict labels are expected to raise a TypeError.
    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {"a": "b"})])
# Load the two remaining labelers' annotation files.
RM = pd.read_csv("../input_data/labeled_data_RM.csv")
GK = pd.read_csv("../input_data/labeled_data_GK.csv")

# Turn the ground-truth cluster string into a list of lower-case labels:
# lower-case it, drop trailing ';' separators, then split on ';'.
truth['labelVecs'] = ''
truth['labels'] = truth['Clusters'].str.lower().str.rstrip(';').str.split(';')

# prepareData is defined outside this section; each returned frame is
# expected to expose a 'labels' column holding an iterable of labels per row.
data = [prepareData(d) for d in [torsten, Mike, CK, RM, GK]]
data.append(truth)

# Gather every label used in any frame so all frames share one encoding.
clusters = set()
for d in data:
    clusters.update(item for sublist in d['labels'] for item in sublist)

enc = MultiLabelBinarizer()
enc.fit([clusters])

# Encode each frame's labels as indicator vectors and store them in the
# 'labelVecs' column.
#
# The previous version assigned to the row objects yielded by iterrows();
# those are copies, so the frames themselves were never updated. It also read
# a misspelled 'labelsVecs' key. The whole column is now encoded in one call
# and assigned back to the frame. Each vector is stored as a single-line
# string such as "[0 1 1]" so a long vector is not broken across several
# lines when written to CSV.
for result in data:
    vectors = enc.transform(result['labels'])
    result['labelVecs'] = [
        '[' + ' '.join(map(str, row)) + ']' for row in vectors
    ]

pd.DataFrame(enc.classes_, columns=['Label']).to_csv('labels.csv', index=False)
truth.to_csv('truth.csv', index=False)
# NOTE(review): the labeler files are written from the original frames, not
# from the entries of `data`. They only contain 'labels'/'labelVecs' if
# prepareData returns the frame it was given (modified in place) - confirm.
torsten.to_csv('labeler1.csv', index=False)
Mike.to_csv('labeler2.csv', index=False)
CK.to_csv('labeler3.csv', index=False)
RM.to_csv('labeler4.csv', index=False)
GK.to_csv('labeler5.csv', index=False)