Exemplo n.º 1
0
def test_data_validation_for_regression(openmlid, as_frame):
    """Fetch an OpenML regression dataset and check that InputValidator
    fit/transform preserves the feature shape and keeps all-NaN columns."""
    x, y = sklearn.datasets.fetch_openml(data_id=openmlid,
                                         return_X_y=True,
                                         as_frame=as_frame)
    validator = InputValidator(is_classification=False)

    if as_frame:
        # NaN is not supported in categories, so drop any
        # categorical/bool column that contains missing values.
        nan_cols = [i for i in x.columns if x[i].isnull().any()]
        cat_cols = [
            i for i in x.columns if x[i].dtype.name in ['category', 'bool']
        ]
        unsupported_columns = list(set(nan_cols) & set(cat_cols))
        if len(unsupported_columns) > 0:
            x.drop(unsupported_columns, axis=1, inplace=True)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x, y, test_size=0.33, random_state=0)

    validator.fit(X_train=X_train, y_train=y_train)

    X_train_t, y_train_t = validator.transform(X_train, y_train)
    assert np.shape(X_train) == np.shape(X_train_t)

    # Leave columns that are complete NaN
    # The sklearn pipeline will handle that
    if as_frame and np.any(pd.isnull(X_train).values.all(axis=0)):
        assert np.any(pd.isnull(X_train_t).values.all(axis=0))
    elif not as_frame and np.any(pd.isnull(X_train).all(axis=0)):
        assert np.any(pd.isnull(X_train_t).all(axis=0))

    # BUG FIX: the original final line was a bare expression with no effect;
    # it must be an assertion to actually verify feat_type was populated.
    assert validator.feature_validator.feat_type is not None
Exemplo n.º 2
0
    def test_big_dataset_encoding2(self):
        """
        When train and test targets contain different classes, encoding
        them jointly must leave no class unknown to the target encoder.
        """
        X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True, as_frame=True)
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, random_state=1)

        # Sanity check: this split must produce classes seen only in test,
        # otherwise the scenario under test would be vacuous.
        all_classes = set(np.unique(y_test)).union(set(np.unique(y_train)))
        test_only = np.setdiff1d(np.unique(y_test), np.unique(y_train))
        self.assertGreater(len(test_only), 0)

        validator = InputValidator()
        joint_targets = validator.join_and_check(
            pd.DataFrame(y),
            pd.DataFrame(y_test),
        )
        validator.validate_target(joint_targets, is_classification=True)

        # Every class observed anywhere must be known to the encoder
        known_classes = set(validator.target_encoder.classes_)
        self.assertEqual(len(all_classes - known_classes), 0)
Exemplo n.º 3
0
    def test_list_input(self):
        """
        Plain python lists must come back as numpy arrays
        """
        validator = InputValidator()
        X_checked, y_checked = validator.validate(self.X, self.y)

        for converted in (X_checked, y_checked):
            self.assertIsInstance(converted, np.ndarray)
Exemplo n.º 4
0
    def test_NaN(self):
        """NaN handling: NaN is preserved in numerical features but always
        rejected in targets, and categorical dataframe features with NaN
        are rejected outright."""
        # numpy - categorical - classification
        # np.nan in categorical array means that the array will be
        # type string, and np.nan will be casted as 'nan'.
        # In turn, 'nan' will be another category
        x = np.array([1, 2, 3, 4, 5.0, np.nan]).reshape(-1, 1)
        y = np.array([1, 2, 3, 4, 5.0, 6.0]).reshape(-1, 1)
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=True)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.issubdtype(y_t.dtype, np.number))
        self.assertTrue(np.isnan(x_t).any())  # Preserve NaN in features
        self.assertEqual(type_of_target(y_t), 'multiclass')
        self.assertTupleEqual(np.shape(x), np.shape(x_t))

        # numpy - categorical - regression
        # nan in target should raise error
        y = np.random.random_sample((6, 1))
        y[1] = np.nan
        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y)

        # numpy - numerical - classification
        # Numerical numpy features should continue without encoding
        # categorical encoding of Nan for the targets is not supported
        x = np.random.random_sample((4, 4))
        x[3] = np.nan
        y = np.random.choice([0.0, 1.0], 4)
        y[1] = np.nan
        x_t = InputValidator().validate_features(x)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.isnan(x_t).any())
        # BUG FIX: the original asserted type_of_target on the stale ``y_t``
        # left over from the first section, which said nothing about this
        # input; the meaningless check is removed.
        self.assertTupleEqual(np.shape(x), np.shape(x_t))

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=True)

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=False)

        x = np.random.random_sample(4)
        x[3] = np.nan
        x = pd.DataFrame(data={'A': x, 'B': x*2})
        y = np.random.choice([0.0, 1.0], 4)
        y[1] = np.nan
        y = pd.DataFrame(y)

        with self.assertRaisesRegex(ValueError, 'Categorical features in a dataframe cannot'):
            InputValidator().validate_features(x)

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=True)

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=False)
Exemplo n.º 5
0
    def test_dataframe_econding_1D(self):
        """
        Encoding and decoding of 1D targets must round-trip
        """
        validator = InputValidator()
        encoded = validator.validate_target(
            pd.DataFrame(data=self.y, dtype=bool),
            is_classification=True,
        )
        np.testing.assert_array_almost_equal(np.array([0, 1, 0]), encoded)

        # A repeated call must produce the identical encoding
        encoded = validator.validate_target(pd.DataFrame(data=self.y, dtype=bool))
        np.testing.assert_array_almost_equal(np.array([0, 1, 0]), encoded)

        decoded = validator.decode_target(encoded)
        np.testing.assert_array_almost_equal(np.array(self.y, dtype=bool), decoded)

        # Same round-trip, now starting from categorical data
        validator = InputValidator()
        encoded = validator.validate_target(
            pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
            is_classification=True,
        )
        np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), encoded)

        decoded = validator.decode_target(encoded)
        self.assertListEqual(['a', 'a', 'b', 'c', 'a'], decoded.tolist())
Exemplo n.º 6
0
    def test_numpy_input(self):
        """
        Float numpy input requires no encoder; this also exercises the
        dedicated feature/target validation entry points.
        """
        validator = InputValidator()
        X_checked = validator.validate_features(np.array(self.X))
        y_checked = validator.validate_target(np.array(self.y))

        self.assertIsInstance(X_checked, np.ndarray)
        self.assertIsInstance(y_checked, np.ndarray)
        self.assertIsNone(validator.feature_encoder)
        self.assertIsNone(validator.target_encoder)
Exemplo n.º 7
0
def test_validation_unsupported():
    """
    Unsupported input combinations must raise informative ValueErrors
    """
    validator = InputValidator()
    two_rows = np.array([[0, 1, 0], [0, 1, 1]])
    six_labels = np.array([0, 1, 0, 0, 0, 0])

    # Train features and targets of different length
    with pytest.raises(ValueError,
                       match=r"Inconsistent number of train datapoints.*"):
        validator.fit(X_train=two_rows, y_train=six_labels)

    # Test features and targets of different length
    with pytest.raises(ValueError,
                       match=r"Inconsistent number of test datapoints.*"):
        validator.fit(
            X_train=two_rows,
            y_train=np.array([0, 1]),
            X_test=two_rows,
            y_test=six_labels,
        )

    # transform() before fit() is not allowed
    with pytest.raises(ValueError,
                       match=r"Cannot call transform on a validator .*fitted"):
        validator.transform(X=two_rows, y=np.array([0, 1]))
Exemplo n.º 8
0
    def test_dataframe_input_numerical(self):
        """
        Numerical dataframes must pass through without any encoder
        """
        for dtype in ('int64', 'float64', 'int8'):
            validator = InputValidator()
            X_checked = validator.validate_features(
                pd.DataFrame(data=self.X, dtype=dtype))
            y_checked = validator.validate_target(
                pd.DataFrame(data=self.y, dtype=dtype))

            self.assertIsInstance(X_checked, np.ndarray)
            self.assertIsInstance(y_checked, np.ndarray)
            self.assertIsNone(validator.target_encoder)
            self.assertIsNone(validator.feature_encoder)
Exemplo n.º 9
0
 def test_regression_conversion(self):
     """
     Regression targets must keep the 'continuous' target type
     regardless of the container they arrive in
     """
     raw = [1.0, 76.9, 123, 4.0, 81.1]
     for container in (raw, np.array(raw), pd.DataFrame(raw)):
         validator = InputValidator()
         y_train = validator.validate_target(
             container,
             is_classification=False,
         )
         self.assertEqual('continuous', type_of_target(y_train))
Exemplo n.º 10
0
def test_refit_shuffle_on_fail(backend, dask_client):
    # Verifies that AutoML.refit() keeps retrying a model whose fit fails:
    # the mock raises ValueError twice before succeeding, so refit must end
    # up calling fit (and fit_transformer) exactly three times.

    # Mocked model: fails twice, then succeeds on the third attempt.
    # fit_transformer's success value is (None, {}) — (Xt, fit_params).
    failing_model = unittest.mock.Mock()
    failing_model.fit.side_effect = [ValueError(), ValueError(), None]
    failing_model.fit_transformer.side_effect = [
        ValueError(), ValueError(), (None, {})]
    failing_model.get_max_iter.return_value = 100

    auto = AutoML(backend, 30, 5, dask_client=dask_client)
    # Mocked ensemble selecting a single model identifier (seed, num_run, budget)
    ensemble_mock = unittest.mock.Mock()
    ensemble_mock.get_selected_model_identifiers.return_value = [(1, 1, 50.0)]
    auto.ensemble_ = ensemble_mock
    auto.InputValidator = InputValidator()
    # Exercise both budget modes; the side_effect lists above are shared
    # across iterations, so only one iteration consumes each 3-entry list.
    for budget_type in [None, 'iterations']:
        auto._budget_type = budget_type

        auto.models_ = {(1, 1, 50.0): failing_model}

        # Make sure a valid 2D array is given to automl
        X = np.array([1, 2, 3]).reshape(-1, 1)
        y = np.array([1, 2, 3])
        auto.InputValidator.fit(X, y)
        auto.refit(X, y)

        assert failing_model.fit.call_count == 3
    # fit_transformer is only used in the budgeted path; checked once at the end
    assert failing_model.fit_transformer.call_count == 3

    del auto
Exemplo n.º 11
0
    def test_dataframe_econding_2D(self):
        """
        Encoding and decoding must also round-trip for 2D (multi-label) targets
        """
        validator = InputValidator()
        multi_label = pd.DataFrame(
            np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
            dtype=bool,
        )
        encoded = validator.validate_target(multi_label, is_classification=True)

        # A repeated call must produce the very same encoding
        np.testing.assert_array_almost_equal(
            validator.validate_target(multi_label), encoded)

        # Decoding must give back the encoded indicator matrix unchanged
        decoded = validator.decode_target(encoded)
        np.testing.assert_array_almost_equal(encoded, decoded)
Exemplo n.º 12
0
 def test_multilabel_conversion(self):
     """
     Encoded classification targets must keep the
     'multilabel-indicator' target type
     """
     indicator = [[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]
     # Exercise the same indicator matrix through each supported container
     for container in (
         indicator,
         np.array(indicator),
         pd.DataFrame(indicator, dtype='category'),
     ):
         validator = InputValidator()
         y_train = validator.validate_target(
             container,
             is_classification=True,
         )
         self.assertEqual('multilabel-indicator', type_of_target(y_train))
Exemplo n.º 13
0
def test_data_validation_for_classification(openmlid, as_frame):
    """Fetch an OpenML classification dataset and check that fit/transform
    preserves shape, encodes everything to numbers, keeps all-NaN columns,
    and sorts categorical columns to the front."""
    x, y = sklearn.datasets.fetch_openml(data_id=openmlid,
                                         return_X_y=True,
                                         as_frame=as_frame)
    validator = InputValidator(is_classification=True)

    if as_frame:
        # NaN is not supported in categories, so drop any
        # categorical/bool column that contains missing values.
        nan_cols = [i for i in x.columns if x[i].isnull().any()]
        cat_cols = [
            i for i in x.columns if x[i].dtype.name in ['category', 'bool']
        ]
        unsupported_columns = list(set(nan_cols) & set(cat_cols))
        if len(unsupported_columns) > 0:
            x.drop(unsupported_columns, axis=1, inplace=True)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x, y, test_size=0.33, random_state=0)

    validator.fit(X_train=X_train,
                  y_train=y_train,
                  X_test=X_test,
                  y_test=y_test)

    X_train_t, y_train_t = validator.transform(X_train, y_train)
    assert np.shape(X_train) == np.shape(X_train_t)

    # Leave columns that are complete NaN
    # The sklearn pipeline will handle that
    if as_frame and np.any(pd.isnull(X_train).values.all(axis=0)):
        assert np.any(pd.isnull(X_train_t).values.all(axis=0))
    elif not as_frame and np.any(pd.isnull(X_train).all(axis=0)):
        assert np.any(pd.isnull(X_train_t).all(axis=0))

    # make sure everything was encoded to number
    assert np.issubdtype(X_train_t.dtype, np.number)
    assert np.issubdtype(y_train_t.dtype, np.number)

    # Categorical columns are sorted to the beginning
    if as_frame:
        # BUG FIX: the original line was a bare expression with no effect;
        # it must be an assertion to actually verify feat_type was populated.
        assert validator.feature_validator.feat_type is not None
        ordered_unique_elements = list(
            dict.fromkeys(validator.feature_validator.feat_type))
        if len(ordered_unique_elements) > 1:
            assert ordered_unique_elements[0] == 'categorical'
Exemplo n.º 14
0
    def test_dataframe_input_categorical(self):
        """
        Categorical dataframes must be encoded automatically
        """
        for dtype in ('bool', 'category'):
            validator = InputValidator()
            X_checked = validator.validate_features(
                pd.DataFrame(data=self.X, dtype=dtype))
            y_checked = validator.validate_target(
                pd.DataFrame(data=self.y, dtype=dtype),
                is_classification=True,
            )

            self.assertIsInstance(X_checked, np.ndarray)
            self.assertIsInstance(y_checked, np.ndarray)
            self.assertIsNotNone(validator.target_encoder)
            self.assertIsNotNone(validator.feature_encoder)
Exemplo n.º 15
0
 def test_continuous_multioutput_conversion(self):
     """
     Multi-output regression input must keep the
     'continuous-multioutput' target type
     """
     raw = [[31.4, 94], [40.5, 109], [25.0, 30]]
     # Exercise the same data through each supported container
     for container in (raw, np.array(raw), pd.DataFrame(raw)):
         validator = InputValidator()
         y_train = validator.validate_target(
             container,
             is_classification=False,
         )
         self.assertEqual('continuous-multioutput', type_of_target(y_train))
Exemplo n.º 16
0
 def test_multiclass_conversion(self):
     """
     Encoded classification targets must keep the 'multiclass' target type
     """
     raw = [1.0, 2.0, 2.0, 4.0, 3]
     for container in (
         raw,
         np.array(raw, dtype=np.float64),
         pd.DataFrame(raw, dtype='category'),
     ):
         validator = InputValidator()
         y_train = validator.validate_target(
             container,
             is_classification=True,
         )
         self.assertEqual('multiclass', type_of_target(y_train))
Exemplo n.º 17
0
    def test_big_dataset_encoding(self):
        """End-to-end check on OpenML dataset 2: NaN in categorical columns
        is rejected; after dropping them, an all-NaN numerical column
        survives validation, everything is numeric, and categorical
        columns are sorted to the front."""
        x, y = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=True)
        validator = InputValidator()

        with self.assertRaisesRegex(
            ValueError,
            'Categorical features in a dataframe cannot contain missing/NaN'
        ):
            x_t, y_t = validator.validate(x, y, is_classification=True)

        # Make sure translation works apart from Nan

        # NaN is not supported in categories, so
        # drop columns with them. Also, do a proof of concept
        # that all nan column is preserved, so that the pipeline deal
        # with it
        # BUG FIX: positional axis/how arguments to DataFrame.dropna are
        # deprecated and removed in pandas 2.0; use keywords, consistent
        # with the dropna calls further down.
        x = x.dropna(axis='columns', how='any')
        x.insert(len(x.columns), 'NaNColumn', np.nan, True)
        x_t, y_t = validator.validate(x, y, is_classification=True)
        self.assertTupleEqual(np.shape(x), np.shape(x_t))

        self.assertTrue(np.all(pd.isnull(x_t[:, -1])))

        # Leave columns that are complete NaN
        # The sklearn pipeline will handle that
        self.assertTrue(np.isnan(x_t).any())
        np.testing.assert_array_equal(
            pd.isnull(x.dropna(axis='columns', how='all')),
            pd.isnull(x.dropna(axis='columns', how='any'))
        )

        # make sure everything was encoded to number
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))

        # No change to numerical columns
        np.testing.assert_array_equal(x['carbon'].to_numpy(), x_t[:, 3])

        # Categorical columns are sorted to the beginning
        self.assertEqual(
            validator.feature_types,
            (['categorical'] * 3) + (['numerical'] * 7)
        )
        self.assertEqual(x.iloc[0, 6], 610)
        # BUG FIX: the np.NaN alias was removed in NumPy 2.0; use np.nan.
        np.testing.assert_array_equal(x_t[0], [0, 0, 0, 8, 0, 0, 0.7, 610, 0, np.nan])
Exemplo n.º 18
0
    def test_binary_conversion(self):
        """
        Encoded two-class targets must keep the 'binary' target type,
        both for numpy arrays and for pandas dataframes
        """
        validator = InputValidator()

        # Two classes only: 1 and 2
        y_train = validator.validate_target(
            np.array([1.0, 2.0, 2.0, 1.0], dtype=np.float64),
            is_classification=True,
        )
        self.assertEqual('binary', type_of_target(y_train))

        # Re-using the fitted validator must also stay binary
        y_valid = validator.validate_target(
            np.array([2.0, 2.0, 2.0, 2.0], dtype=np.float64),
            is_classification=True,
        )
        self.assertEqual('binary', type_of_target(y_valid))

        # Same check, starting from a pandas dataframe
        validator = InputValidator()
        y_train = validator.validate_target(
            pd.DataFrame([1.0, 2.0, 2.0, 1.0], dtype='category'),
            is_classification=True,
        )
        self.assertEqual('binary', type_of_target(y_train))
Exemplo n.º 19
0
    def test_all_posible_dtype_changes(self):
        """Once fitted, the validator must reject inputs whose container
        type differs from the one seen at fit time."""
        data = [[1, 0, 1], [1, 1, 1]]
        containers = [data, np.array(data), pd.DataFrame(data)]

        # Every ordered pair of distinct container types must be rejected
        for first, second in itertools.permutations(containers, r=2):
            validator = InputValidator()
            validator.validate_target(first)
            with self.assertRaisesRegex(ValueError,
                                        "Auto-sklearn previously received targets of type"):
                validator.validate_target(second)
            validator.validate_features(first)
            with self.assertRaisesRegex(ValueError,
                                        "Auto-sklearn previously received features of type"):
                validator.validate_features(second)
Exemplo n.º 20
0
    def test_sparse_numpy_input(self):
        """
        Sparse float features need no encoder and stay sparse;
        sparse targets are rejected with a helpful message.
        """
        validator = InputValidator()

        # Sparse data
        row_ind = np.array([0, 1, 2])
        col_ind = np.array([1, 2, 1])
        X_sparse = sparse.csr_matrix((np.ones(3), (row_ind, col_ind)))
        X = validator.validate_features(
            X_sparse,
        )
        y = validator.validate_target(
            np.array(self.y)
        )

        # BUG FIX: ``sparse.csr.csr_matrix`` reaches into a private SciPy
        # submodule, which is deprecated and removed in recent SciPy
        # releases; use the public ``sparse.csr_matrix`` class instead.
        self.assertIsInstance(X, sparse.csr_matrix)
        self.assertIsInstance(y, np.ndarray)
        self.assertIsNone(validator.target_encoder)
        self.assertIsNone(validator.feature_encoder)

        # Sparse targets should not be supported
        data = np.array([1, 2, 3, 4, 5, 6])
        col = np.array([0, 0, 0, 0, 0, 0])
        row = np.array([0,  2,  3,  6,  7, 10])
        y = sparse.csr_matrix((data, (row, col)), shape=(11, 1))
        with self.assertRaisesRegex(ValueError, 'scipy.sparse.csr_matrix.todense'):
            validator = InputValidator().validate_target(y)
Exemplo n.º 21
0
def test_multiclass_prediction(predict_mock, dask_client):
    """
    Class probabilities from the mocked model must be decoded back
    into the original (string) class labels.
    """
    probabilities = [[0, 0, 0.99], [0, 0.99, 0], [0.99, 0, 0],
                     [0, 0.99, 0], [0, 0, 0.99]]
    predicted_indexes = [2, 1, 0, 1, 2]
    expected_result = ['c', 'b', 'a', 'b', 'c']

    predict_mock.return_value = np.array(probabilities)

    classifier = AutoMLClassifier(
        time_left_for_this_task=1,
        per_run_time_limit=1,
        dask_client=dask_client,
    )
    # Fit only the target validator so that predict() can decode labels
    classifier.InputValidator = InputValidator(is_classification=True)
    classifier.InputValidator.target_validator.fit(
        pd.DataFrame(expected_result, dtype='category'))
    classifier.InputValidator._is_fitted = True

    actual_result = classifier.predict([None] * len(predicted_indexes))

    np.testing.assert_array_equal(expected_result, actual_result)
Exemplo n.º 22
0
def test_multilabel_prediction(predict_mock, dask_client):
    """
    Multilabel probability output from the mocked model must be turned
    back into the original indicator matrix.
    """
    probabilities = [[0.99, 0], [0.99, 0], [0, 0.99], [0.99, 0.99],
                     [0.99, 0.99]]
    predicted_indexes = np.array([[1, 0], [1, 0], [0, 1], [1, 1], [1, 1]])

    predict_mock.return_value = np.array(probabilities)

    classifier = AutoMLClassifier(
        time_left_for_this_task=1,
        per_run_time_limit=1,
        dask_client=dask_client,
    )
    # Fit only the target validator so that predict() can decode the output
    classifier.InputValidator = InputValidator(is_classification=True)
    classifier.InputValidator.target_validator.fit(
        pd.DataFrame(predicted_indexes, dtype='int64'))
    classifier.InputValidator._is_fitted = True

    assert classifier.InputValidator.target_validator.type_of_target == 'multilabel-indicator'

    actual_result = classifier.predict([None] * len(predicted_indexes))

    np.testing.assert_array_equal(predicted_indexes, actual_result)
Exemplo n.º 23
0
def test_sparse_data_validation_for_regression():
    """
    Sparse features must be accepted for regression, come out numeric,
    and be transformable even after switching the sparse format.
    """
    X, y = sklearn.datasets.make_regression(n_samples=100,
                                            n_features=50,
                                            random_state=0)
    validator = InputValidator(is_classification=False)
    validator.fit(X_train=sparse.coo_matrix(X), y_train=y)

    X_t, y_t = validator.transform(X, y)
    assert np.shape(X) == np.shape(X_t)

    # Everything must come out encoded as numbers
    assert np.issubdtype(X_t.dtype, np.number)
    assert np.issubdtype(y_t.dtype, np.number)

    # Changing the sparse format at transform time is allowed
    X_t, y_t = validator.transform(sparse.csr_matrix(X), y)
Exemplo n.º 24
0
    def test_join_and_check(self):
        """join_and_check must concatenate train/test targets for numpy,
        pandas and list inputs, and reject inconsistent pairs."""
        validator = InputValidator()

        # Numpy testing
        y = np.array([2, 2, 3, 4, 5])
        y_test = np.array([3, 4, 5, 6, 1])
        joined = validator.join_and_check(y, y_test)
        np.testing.assert_array_equal(
            joined,
            np.array([2, 2, 3, 4, 5, 3, 4, 5, 6, 1])
        )

        # Fitting on the joined targets gives train and test a common
        # encoding, so shared elements must map to the same code
        validator.validate_target(joined, is_classification=True)
        y_encoded = validator.validate_target(y)
        y_test_encoded = validator.validate_target(y_test)
        self.assertEqual(y_encoded[2], y_test_encoded[0])

        # Pandas testing
        validator = InputValidator()
        joined = validator.join_and_check(
            pd.DataFrame(y),
            pd.DataFrame(y_test)
        )
        np.testing.assert_array_equal(
            joined,
            pd.DataFrame([2, 2, 3, 4, 5, 3, 4, 5, 6, 1])
        )

        # List testing
        validator = InputValidator()
        joined = validator.join_and_check([2, 2, 3, 4, 5], [3, 4, 5, 6, 1])
        np.testing.assert_array_equal(joined, [2, 2, 3, 4, 5, 3, 4, 5, 6, 1])

        # Error cases: mismatching dimensionality, then mismatching types
        y = np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]])
        y_test = np.array([3, 4, 5, 6, 1])
        with self.assertRaisesRegex(
            ValueError,
            'Train and test targets must have the same dimensionality'
        ):
            joined = validator.join_and_check(y, y_test)
        with self.assertRaisesRegex(
            ValueError,
            'Train and test targets must be of the same type'
        ):
            joined = validator.join_and_check(y, pd.DataFrame(y_test))
Exemplo n.º 25
0
    def test_no_new_category_after_fit(self):
        """After fitting on categorical data, the validator must reject
        inputs that contain categories it has not seen before."""
        # First make sure no problem if no categorical:
        # mutating a numerical column after fit is allowed
        x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
        y = pd.DataFrame([1, 2, 3, 4])
        validator = InputValidator()
        validator.validate(x, y, is_classification=True)
        validator.validate_features(x)
        x['A'] = x['A'].apply(lambda x: x*x)
        validator.validate_features(x)

        # Then make sure we catch categorical extra categories:
        # squaring the values creates categories unseen at fit time
        x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category')
        y = pd.DataFrame([1, 2, 3, 4])
        validator = InputValidator()
        validator.validate(x, y, is_classification=True)
        validator.validate_features(x)
        x['A'] = x['A'].apply(lambda x: x*x)
        with self.assertRaisesRegex(
            ValueError,
            'During fit, the input features contained categorical values'
        ):
            validator.validate_features(x)

        # For label encoder of targets: 5 was not among the fitted classes
        with self.assertRaisesRegex(
            ValueError,
            'During fit, the target array contained the categorical'
        ):
            validator.validate_target(pd.DataFrame([1, 2, 5, 4]))

        # For ordinal encoder of targets (2D categorical targets)
        x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category')
        validator = InputValidator()
        validator.validate(x, x, is_classification=True)
        # Same categories as fit: accepted
        validator.validate_target(pd.DataFrame(
            {'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category')
        )
        # Unseen category 9 in column B: rejected
        with self.assertRaisesRegex(
            ValueError,
            'During fit, the target array contained the categorical'
        ):
            validator.validate_target(pd.DataFrame(
                {'A': [1, 2, 3, 4], 'B': [5, 9, 7, 8]}, dtype='category')
            )
        return
Exemplo n.º 26
0
    def test_noNaN(self):
        """
        Makes sure that during classification/regression task,
        the transformed data is not corrupted.

        Testing is given without NaN and no sparse data.

        Covers every combination of container (numpy / pandas),
        feature kind (categorical / numerical) and task
        (classification / regression).
        """
        # numpy - categorical - classification
        # String-dtype numpy features are not supported at all
        x = np.array(['a', 'b', 'c', 'a', 'b', 'c']).reshape(-1, 1)
        validator = InputValidator()
        with self.assertRaisesRegex(ValueError,
                                    'the only valid dtypes are numerical ones'):
            x_t, y_t = validator.validate(x, np.copy(x), is_classification=True)

        # numpy - categorical - regression
        with self.assertRaisesRegex(ValueError,
                                    'the only valid dtypes are numerical ones'):
            x_t, y_t = validator.validate(x, np.copy(x), is_classification=False)

        # numpy - numerical - classification
        x = np.random.random_sample((4, 4))
        y = np.random.choice([0, 1], 4)
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=True)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.issubdtype(y_t.dtype, np.number))
        self.assertEqual(type_of_target(y_t), 'binary')
        self.assertTupleEqual(np.shape(x), np.shape(x_t))
        self.assertTupleEqual(np.shape(y), np.shape(y_t))

        # numpy - numerical - regression
        # Valid numerical input must pass through completely unchanged
        x = np.random.random_sample((4, 4))
        y = np.random.random_sample(4)
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=False)
        np.testing.assert_array_equal(x, x_t)  # No change to valid data
        np.testing.assert_array_equal(y, y_t)
        self.assertEqual(type_of_target(y_t), 'continuous')

        # pandas - categorical - classification
        # Categorical features and targets are both encoded to numbers
        x = pd.DataFrame({'A': np.random.choice(['a', 'b'], 4),
                          'B': np.random.choice(['a', 'b'], 4)},
                         dtype='category')
        y = pd.DataFrame(np.random.choice(['c', 'd'], 4), dtype='category')
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=True)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.issubdtype(y_t.dtype, np.number))
        self.assertEqual(type_of_target(y_t), 'binary')
        self.assertTupleEqual(np.shape(x), np.shape(x_t))
        # Single-column dataframe targets come back ravelled to 1D
        self.assertTupleEqual(np.shape(y.to_numpy().reshape(-1)), np.shape(y_t))  # ravel

        # pandas - categorical - regression
        x = pd.DataFrame({'A': np.random.choice(['a', 'b'], 4),
                          'B': np.random.choice(['a', 'b'], 4)},
                         dtype='category')
        y = pd.DataFrame(np.random.random_sample(4))
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=False)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.issubdtype(y_t.dtype, np.number))
        self.assertEqual(type_of_target(y_t), 'continuous')
        self.assertTupleEqual(np.shape(x), np.shape(x_t))
        np.testing.assert_array_equal(y.to_numpy().reshape(-1), y_t)
        self.assertTupleEqual(np.shape(y.to_numpy().reshape(-1)), np.shape(y_t))  # ravel version

        # pandas - numerical - classification
        # Float targets for classification are label-encoded (0, 1, 2, ...)
        x = pd.DataFrame({'A': np.random.random_sample(4),
                          'B': np.random.choice([2.5, 1.2], 4)})
        y = pd.DataFrame([1.0, 2.2, 3.2, 2.2])
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=True)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.issubdtype(y_t.dtype, np.number))
        self.assertEqual(type_of_target(y_t), 'multiclass')
        self.assertTupleEqual(np.shape(x), np.shape(x_t))
        np.testing.assert_array_equal(np.array([0, 1, 2, 1]), y_t)
        self.assertTupleEqual(np.shape(y.to_numpy().reshape(-1)), np.shape(y_t))  # ravel

        # pandas - numerical - regression
        x = pd.DataFrame({'A': np.random.choice([1.5, 3.6], 4),
                          'B': np.random.choice([2.5, 1.2], 4)})
        y = pd.DataFrame(np.random.random_sample(4))
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=False)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.issubdtype(y_t.dtype, np.number))
        self.assertEqual(type_of_target(y_t), 'continuous')
        self.assertTupleEqual(np.shape(x), np.shape(x_t))
        self.assertTupleEqual(np.shape(y.to_numpy().reshape(-1)), np.shape(y_t))  # ravel
        np.testing.assert_array_equal(y.to_numpy().reshape(-1), y_t)
        return
Exemplo n.º 27
0
    def test_dataframe_input_unsupported(self):
        """
        Unsupported dataframe/dict inputs must raise informative errors
        """
        validator = InputValidator()
        with self.assertRaisesRegex(ValueError, "Auto-sklearn does not support time"):
            validator.validate_features(
                pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}))
        with self.assertRaisesRegex(ValueError, "has invalid type object"):
            validator.validate_features(pd.DataFrame({'string': ['foo']}))

        # A plain dict is not a valid 2D feature container
        validator = InputValidator()
        with self.assertRaisesRegex(ValueError, "Expected 2D array, got"):
            validator.validate_features({'input1': 1, 'input2': 2})

        # Neither is an arbitrary object
        validator = InputValidator()
        with self.assertRaisesRegex(ValueError, "Expected 2D array, got"):
            validator.validate_features(InputValidator())

        validator = InputValidator()
        X = pd.DataFrame(data=['a', 'b', 'c'], dtype='category')
        with unittest.mock.patch('autosklearn.data.validation.InputValidator._check_and_get_columns_to_encode') as mock_foo:  # noqa E501
            # Pretend no column needs encoding; the later sanity check
            # must still catch the unconverted categorical input
            mock_foo.return_value = ([], [])
            with self.assertRaisesRegex(ValueError, 'Failed to convert the input'):
                validator.validate_features(X)