Exemplo n.º 1
0
    def test_big_dataset_encoding(self):
        """
        Validates a mixed categorical/numerical dataframe fetched from OpenML.

        The dataset (OpenML ``data_id=2``) is first validated as-is, which must
        be rejected with the "Categorical features in a dataframe cannot
        contain missing/NaN" error. The NaN-containing columns are then
        dropped, an all-NaN column is appended, and the validated output is
        checked for shape, dtype, column order and content.

        NOTE(review): ``fetch_openml`` presumably needs network access or a
        local cache -- confirm before running offline.
        """
        x, y = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=True)
        validator = InputValidator()

        with self.assertRaisesRegex(
            ValueError,
            'Categorical features in a dataframe cannot contain missing/NaN'
        ):
            validator.validate(x, y, is_classification=True)

        # Make sure translation works apart from NaN.

        # NaN is not supported in categories, so drop the columns that
        # contain any. Also, as a proof of concept, append a column that is
        # entirely NaN: it must be preserved so that the pipeline deals with it.
        # `axis`/`how` are passed by keyword: pandas >= 2.0 no longer accepts
        # them positionally.
        x = x.dropna(axis='columns', how='any')
        x.insert(len(x.columns), 'NaNColumn', np.nan, allow_duplicates=True)
        x_t, y_t = validator.validate(x, y, is_classification=True)
        self.assertTupleEqual(np.shape(x), np.shape(x_t))

        # The all-NaN column is still the last column of the output
        self.assertTrue(np.all(pd.isnull(x_t[:, -1])))

        # Leave columns that are complete NaN
        # The sklearn pipeline will handle that
        self.assertTrue(np.isnan(x_t).any())
        # Sanity check on the input: the all-NaN column is the only place
        # where NaN occurs, so dropping with how='all' or how='any' agrees.
        np.testing.assert_array_equal(
            pd.isnull(x.dropna(axis='columns', how='all')),
            pd.isnull(x.dropna(axis='columns', how='any'))
        )

        # make sure everything was encoded to number
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))

        # No change to numerical columns
        np.testing.assert_array_equal(x['carbon'].to_numpy(), x_t[:, 3])

        # Categorical columns are sorted to the beginning
        self.assertEqual(
            validator.feature_types,
            (['categorical'] * 3) + (['numerical'] * 7)
        )
        self.assertEqual(x.iloc[0, 6], 610)
        # np.nan (not the np.NaN alias, which NumPy 2.0 removed)
        np.testing.assert_array_equal(x_t[0], [0, 0, 0, 8, 0, 0, 0.7, 610, 0, np.nan])
Exemplo n.º 2
0
    def test_NaN(self):
        """
        Checks how NaN is treated in features and in targets.

        NaN in numpy features must be preserved in the validated output,
        whereas NaN in a target must raise a ``ValueError``. A pandas
        dataframe whose columns contain NaN is expected to be rejected by
        ``validate_features``.
        """
        # numpy - numerical features with NaN - classification
        # The feature array is a float array, so np.nan stays a missing
        # value and has to come out of the validator untouched.
        x = np.array([1, 2, 3, 4, 5.0, np.nan]).reshape(-1, 1)
        y = np.array([1, 2, 3, 4, 5.0, 6.0]).reshape(-1, 1)
        validator = InputValidator()
        x_t, y_t = validator.validate(x, y, is_classification=True)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.issubdtype(y_t.dtype, np.number))
        self.assertTrue(np.isnan(x_t).any())  # Preserve NaN in features
        self.assertEqual(type_of_target(y_t), 'multiclass')
        self.assertTupleEqual(np.shape(x), np.shape(x_t))

        # numpy - continuous target
        # nan in target should raise error
        y = np.random.random_sample((6, 1))
        y[1] = np.nan
        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y)

        # numpy - numerical - classification
        # Numerical numpy features should continue without encoding
        # categorical encoding of NaN for the targets is not supported
        x = np.random.random_sample((4, 4))
        x[3] = np.nan
        y = np.random.choice([0.0, 1.0], 4)
        y[1] = np.nan
        x_t = InputValidator().validate_features(x)
        self.assertTrue(np.issubdtype(x_t.dtype, np.number))
        self.assertTrue(np.isnan(x_t).any())
        # No target assertion here: this scenario never yields a validated
        # target, since validating `y` is expected to raise (checked below).
        self.assertTupleEqual(np.shape(x), np.shape(x_t))

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=True)

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=False)

        # pandas - dataframe features and target, both with NaN
        x = np.random.random_sample(4)
        x[3] = np.nan
        x = pd.DataFrame(data={'A': x, 'B': x*2})
        y = np.random.choice([0.0, 1.0], 4)
        y[1] = np.nan
        y = pd.DataFrame(y)

        with self.assertRaisesRegex(ValueError, 'Categorical features in a dataframe cannot'):
            InputValidator().validate_features(x)

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=True)

        with self.assertRaisesRegex(ValueError, 'Target values cannot contain missing/NaN'):
            InputValidator().validate_target(y, is_classification=False)
Exemplo n.º 3
0
    def test_list_input(self):
        """
        Checks that the validator hands back numpy arrays for both the
        features and the target.

        NOTE(review): ``self.X`` / ``self.y`` are presumably plain lists
        prepared by the test fixture -- not visible from here.
        """
        features, target = InputValidator().validate(self.X, self.y)

        for validated in (features, target):
            self.assertIsInstance(validated, np.ndarray)
Exemplo n.º 4
0
    def test_no_new_category_after_fit(self):
        """
        Checks that categories not seen in the first validation are rejected
        in later calls, for features as well as targets, while purely
        numerical features may take new values freely.
        """
        def square(value):
            return value * value

        def categorical_frame(column_b):
            # Two-column frame where every column has dtype 'category'
            return pd.DataFrame({'A': [1, 2, 3, 4], 'B': column_b}, dtype='category')

        target_error = 'During fit, the target array contained the categorical'

        # Numerical features: changing the values after the first
        # validation must not be a problem
        numeric_features = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
        numeric_target = pd.DataFrame([1, 2, 3, 4])
        numeric_validator = InputValidator()
        numeric_validator.validate(numeric_features, numeric_target, is_classification=True)
        numeric_validator.validate_features(numeric_features)
        numeric_features['A'] = numeric_features['A'].apply(square)
        numeric_validator.validate_features(numeric_features)

        # Categorical features: squaring column 'A' introduces categories
        # that were absent in the first validation, which must be caught
        cat_features = categorical_frame([5, 6, 7, 8])
        cat_target = pd.DataFrame([1, 2, 3, 4])
        cat_validator = InputValidator()
        cat_validator.validate(cat_features, cat_target, is_classification=True)
        cat_validator.validate_features(cat_features)
        cat_features['A'] = cat_features['A'].apply(square)
        with self.assertRaisesRegex(
            ValueError,
            'During fit, the input features contained categorical values',
        ):
            cat_validator.validate_features(cat_features)

        # Label encoder of targets: the value 5 was not in the fitted target
        with self.assertRaisesRegex(ValueError, target_error):
            cat_validator.validate_target(pd.DataFrame([1, 2, 5, 4]))

        # Ordinal encoder of targets: the same categorical frame is used as
        # features and as (two-column) target
        frame = categorical_frame([5, 6, 7, 8])
        ordinal_validator = InputValidator()
        ordinal_validator.validate(frame, frame, is_classification=True)
        ordinal_validator.validate_target(categorical_frame([5, 6, 7, 8]))
        with self.assertRaisesRegex(ValueError, target_error):
            # 9 in column 'B' is a category absent from the fitted target
            ordinal_validator.validate_target(categorical_frame([5, 9, 7, 8]))
Exemplo n.º 5
0
    def test_noNaN(self):
        """
        Checks that validation does not corrupt the data for classification
        and regression tasks.

        Only dense inputs without NaN are exercised here, given as numpy
        arrays and as pandas dataframes.
        """
        def check_numeric(*arrays):
            # Every validated array must carry a numerical dtype
            for array in arrays:
                self.assertTrue(np.issubdtype(array.dtype, np.number))

        # numpy - string data is rejected for classification and regression,
        # using one and the same validator instance
        letters = np.array(['a', 'b', 'c', 'a', 'b', 'c']).reshape(-1, 1)
        string_validator = InputValidator()
        for classification in (True, False):
            with self.assertRaisesRegex(
                ValueError,
                'the only valid dtypes are numerical ones',
            ):
                string_validator.validate(
                    letters, np.copy(letters), is_classification=classification)

        # numpy - numerical - classification
        features = np.random.random_sample((4, 4))
        target = np.random.choice([0, 1], 4)
        features_out, target_out = InputValidator().validate(
            features, target, is_classification=True)
        check_numeric(features_out, target_out)
        self.assertEqual(type_of_target(target_out), 'binary')
        self.assertTupleEqual(np.shape(features), np.shape(features_out))
        self.assertTupleEqual(np.shape(target), np.shape(target_out))

        # numpy - numerical - regression
        features = np.random.random_sample((4, 4))
        target = np.random.random_sample(4)
        features_out, target_out = InputValidator().validate(
            features, target, is_classification=False)
        # Valid data must come back unchanged
        np.testing.assert_array_equal(features, features_out)
        np.testing.assert_array_equal(target, target_out)
        self.assertEqual(type_of_target(target_out), 'continuous')

        # pandas - categorical - classification
        features = pd.DataFrame(
            {
                'A': np.random.choice(['a', 'b'], 4),
                'B': np.random.choice(['a', 'b'], 4),
            },
            dtype='category',
        )
        target = pd.DataFrame(np.random.choice(['c', 'd'], 4), dtype='category')
        features_out, target_out = InputValidator().validate(
            features, target, is_classification=True)
        check_numeric(features_out, target_out)
        self.assertEqual(type_of_target(target_out), 'binary')
        self.assertTupleEqual(np.shape(features), np.shape(features_out))
        # The single-column target is compared against its flattened shape
        flat_target = target.to_numpy().reshape(-1)
        self.assertTupleEqual(np.shape(flat_target), np.shape(target_out))

        # pandas - categorical - regression
        features = pd.DataFrame(
            {
                'A': np.random.choice(['a', 'b'], 4),
                'B': np.random.choice(['a', 'b'], 4),
            },
            dtype='category',
        )
        target = pd.DataFrame(np.random.random_sample(4))
        features_out, target_out = InputValidator().validate(
            features, target, is_classification=False)
        check_numeric(features_out, target_out)
        self.assertEqual(type_of_target(target_out), 'continuous')
        self.assertTupleEqual(np.shape(features), np.shape(features_out))
        flat_target = target.to_numpy().reshape(-1)
        np.testing.assert_array_equal(flat_target, target_out)
        self.assertTupleEqual(np.shape(flat_target), np.shape(target_out))

        # pandas - numerical - classification
        features = pd.DataFrame({
            'A': np.random.random_sample(4),
            'B': np.random.choice([2.5, 1.2], 4),
        })
        target = pd.DataFrame([1.0, 2.2, 3.2, 2.2])
        features_out, target_out = InputValidator().validate(
            features, target, is_classification=True)
        check_numeric(features_out, target_out)
        self.assertEqual(type_of_target(target_out), 'multiclass')
        self.assertTupleEqual(np.shape(features), np.shape(features_out))
        # The float labels are expected to be encoded as 0, 1, 2, 1
        np.testing.assert_array_equal(np.array([0, 1, 2, 1]), target_out)
        flat_target = target.to_numpy().reshape(-1)
        self.assertTupleEqual(np.shape(flat_target), np.shape(target_out))

        # pandas - numerical - regression
        features = pd.DataFrame({
            'A': np.random.choice([1.5, 3.6], 4),
            'B': np.random.choice([2.5, 1.2], 4),
        })
        target = pd.DataFrame(np.random.random_sample(4))
        features_out, target_out = InputValidator().validate(
            features, target, is_classification=False)
        check_numeric(features_out, target_out)
        self.assertEqual(type_of_target(target_out), 'continuous')
        self.assertTupleEqual(np.shape(features), np.shape(features_out))
        flat_target = target.to_numpy().reshape(-1)
        self.assertTupleEqual(np.shape(flat_target), np.shape(target_out))
        np.testing.assert_array_equal(flat_target, target_out)