示例#1
0
def test_targetvalidator_inversetransform():
    """
    Test that the encoding/decoding works in 1D
    """
    validator = TargetValidator(is_classification=True)
    validator.fit(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    y = validator.transform(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y)

    y_decoded = validator.inverse_transform(y)
    assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist()

    assert validator.classes_.tolist() == ['a', 'b', 'c']

    validator = TargetValidator(is_classification=True)
    multi_label = pd.DataFrame(
        np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
        dtype=bool
    )
    validator.fit(multi_label)
    y = validator.transform(multi_label)

    y_decoded = validator.inverse_transform(y)
    np.testing.assert_array_almost_equal(y, y_decoded)

    # Multilabel classification is not encoded
    # For this reason, classes_ attribute does not contain a class
    np.testing.assert_array_almost_equal(validator.classes_, np.array([]))
def test_targetvalidator_inversetransform():
    """
    Test that the encoding/decoding works in 1D
    """
    validator = TargetValidator(is_classification=True)
    validator.fit(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    y = validator.transform(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y)

    y_decoded = validator.inverse_transform(y)
    assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist()

    validator = TargetValidator(is_classification=True)
    multi_label = pd.DataFrame(
        np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
        dtype=bool
    )
    validator.fit(multi_label)
    y = validator.transform(multi_label)

    y_decoded = validator.inverse_transform(y)
    np.testing.assert_array_almost_equal(y, y_decoded)
示例#3
0
def test_target_unsupported():
    """
    Makes sure we raise a proper message to the user,
    when providing not supported data input
    """
    validator = TargetValidator(is_classification=True)
    with pytest.raises(ValueError, match=r"The dimensionality of the train and test targets"):
        validator.fit(
            np.array([[0, 1, 0], [0, 1, 1]]),
            np.array([[0, 1, 0, 0], [0, 1, 1, 1]]),
        )
    with pytest.raises(ValueError, match=r"Train and test targets must both have the same dtypes"):
        validator.fit(
            pd.DataFrame({'a': [1, 2, 3]}),
            pd.DataFrame({'a': [True, False, False]}),
        )
    with pytest.raises(ValueError, match=r"Provided targets are not supported.*"):
        validator.fit(
            np.array([[0, 1, 2], [0, 3, 4]]),
            np.array([[0, 1, 2, 5], [0, 3, 4, 6]]),
        )
    with pytest.raises(ValueError, match="Train and test targets must both have the same"):
        validator.fit(
            pd.DataFrame({'string': ['foo']}),
            pd.DataFrame({'int': [1]}),
        )
    with pytest.raises(ValueError, match=r"Auto-sklearn only supports Numpy arrays, .*"):
        validator.fit({'input1': 1, 'input2': 2})
    with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
        validator.fit(np.array([np.nan, 1, 2]))
    with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
        validator.fit(sparse.csr_matrix(np.array([1, 2, np.nan])))
    with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"):
        validator.transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError, match=r"Cannot call inverse_transform on a validator that is"):
        validator.inverse_transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError, match=r"Multi-dimensional classification is not yet supported"):
        validator._fit(np.array([[1, 2, 3], [1, 5, 6]]))

    # Dia/ DOK are not supported as type of target makes calls len on the array
    # which causes TypeError: len() of unsized object. Basically, sparse data as
    # multi-label is the only thing that makes sense in this format.
    with pytest.raises(ValueError, match=r"The provided data could not be interpreted by Sklearn"):
        validator.fit(sparse.dia_matrix(np.array([1, 2, 3])))

    validator.fit(np.array([[0, 1, 0], [0, 1, 1]]))
    with pytest.raises(ValueError, match=r"Number of outputs changed from"):
        validator.fit(np.array([0, 1, 0]))
示例#4
0
def test_targetvalidator_supported_types_classification(input_data_targettest):
    validator = TargetValidator(is_classification=True)
    validator.fit(input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    if sparse.issparse(input_data_targettest):
        assert sparse.issparse(transformed_y)
    else:
        assert isinstance(transformed_y, np.ndarray)
    epected_shape = np.shape(input_data_targettest)
    if len(epected_shape) > 1 and epected_shape[1] == 1:
        # The target should have (N,) dimensionality instead of (N, 1)
        epected_shape = (epected_shape[0], )
    assert epected_shape == np.shape(transformed_y)
    assert np.issubdtype(transformed_y.dtype, np.number)
    assert validator._is_fitted

    # Because there is no classification, we do not expect a encoder
    if not sparse.issparse(input_data_targettest):
        assert validator.encoder is not None

        # The encoding should be per column
        if len(transformed_y.shape) == 1:
            assert np.min(transformed_y) == 0
            assert np.max(transformed_y) == len(np.unique(transformed_y)) - 1
        else:
            for col in range(transformed_y.shape[1]):
                assert np.min(transformed_y[:, col]) == 0
                assert np.max(transformed_y[:, col]) == len(np.unique(transformed_y[:, col])) - 1

        # Make sure we can perform inverse transform
        y_inverse = validator.inverse_transform(transformed_y)
        if hasattr(input_data_targettest, 'dtype'):
            # In case of numeric, we need to make sure dtype is preserved
            if is_numeric_dtype(input_data_targettest.dtype):
                assert y_inverse.dtype == input_data_targettest.dtype
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(np.array(y_inverse), np.array(input_data_targettest))
        elif hasattr(input_data_targettest, 'dtypes'):
            if is_numeric_dtype(input_data_targettest.dtypes[0]):
                assert y_inverse.dtype == input_data_targettest.dtypes[0]
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(np.array(y_inverse),
                                          # pandas is always (N, 1) but targets are ravel()
                                          input_data_targettest.to_numpy().reshape(-1))
    else:
        # Sparse is not encoded, mainly because the sparse data is expected
        # to be numpy of numerical type -- which currently does not require encoding
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.todense()),
            np.ravel(transformed_y.todense())
        )