def test_one_hot_encoder_set_params():
    """Check that ``set_params(categories=...)`` is honoured by the encoder.

    The categories are changed once before the first fit and once after the
    encoder has already been fitted; each time the width of the encoded
    output must match the number of categories that were set.
    """
    column = np.array([[1, 2]]).T
    encoder = OneHotEncoder()

    # Parameters changed before the encoder has ever been fitted.
    encoder.set_params(categories=[[0, 1, 2, 3]])
    assert encoder.get_params()['categories'] == [[0, 1, 2, 3]]
    encoded = encoder.fit_transform(column).toarray()
    assert encoded.shape == (2, 4)

    # Parameters changed again on the now-fitted encoder.
    encoder.set_params(categories=[[0, 1, 2, 3, 4]])
    encoded = encoder.fit_transform(column).toarray()
    assert encoded.shape == (2, 5)
def check_categorical_onehot(X):
    """Encode ``X`` with sparse and dense output and check that they agree.

    Two encoders with ``categories='auto'`` are fitted on ``X``: one with the
    default output and one with ``sparse=False``. The default output must be
    a CSR matrix and, once densified, must be close to the dense output.

    Returns
    -------
    The densified result of the default (sparse) encoder.
    """
    sparse_result = OneHotEncoder(categories='auto').fit_transform(X)
    dense_result = OneHotEncoder(categories='auto',
                                 sparse=False).fit_transform(X)

    assert_allclose(sparse_result.toarray(), dense_result)
    assert sparse.isspmatrix_csr(sparse_result)
    return sparse_result.toarray()
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
    """Check the encoded values for a given input dtype and output dtype.

    A single column ``[0, 1]`` of dtype ``input_dtype`` is encoded with
    ``dtype=output_dtype``; both ``fit_transform`` and ``fit().transform()``
    are compared to the expected 2x2 array, for sparse and dense output.

    NOTE(review): the arguments are presumably supplied by pytest
    parametrization that is not visible in this chunk -- confirm.
    """
    column = np.asarray([[0, 1]], dtype=input_dtype).T
    expected = np.asarray([[1, 0],
                           [0, 1]], dtype=output_dtype)

    # Default (sparse) output: densify before comparing.
    sparse_enc = OneHotEncoder(categories='auto', dtype=output_dtype)
    one_step = sparse_enc.fit_transform(column).toarray()
    assert_array_equal(one_step, expected)
    two_step = sparse_enc.fit(column).transform(column).toarray()
    assert_array_equal(two_step, expected)

    # Dense output: compare directly.
    dense_enc = OneHotEncoder(categories='auto', dtype=output_dtype,
                              sparse=False)
    one_step = dense_enc.fit_transform(column)
    assert_array_equal(one_step, expected)
    two_step = dense_enc.fit(column).transform(column)
    assert_array_equal(two_step, expected)
def test_one_hot_encoder_dtype_pandas(output_dtype):
    """Check the encoded values and ``output_dtype`` for a pandas DataFrame.

    The frame mixes a string column and an integer column. The test is
    skipped when pandas cannot be imported.

    NOTE(review): ``output_dtype`` is presumably supplied by pytest
    parametrization that is not visible in this chunk -- confirm.
    """
    pd = pytest.importorskip('pandas')

    frame = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    expected = np.array([[1, 0, 1, 0],
                         [0, 1, 0, 1]], dtype=output_dtype)

    # Default (sparse) output: densify before comparing.
    sparse_enc = OneHotEncoder(dtype=output_dtype)
    one_step = sparse_enc.fit_transform(frame).toarray()
    assert_array_equal(one_step, expected)
    two_step = sparse_enc.fit(frame).transform(frame).toarray()
    assert_array_equal(two_step, expected)

    # Dense output: compare directly.
    dense_enc = OneHotEncoder(dtype=output_dtype, sparse=False)
    one_step = dense_enc.fit_transform(frame)
    assert_array_equal(one_step, expected)
    two_step = dense_enc.fit(frame).transform(frame)
    assert_array_equal(two_step, expected)
def test_one_hot_encoder_unsorted_categories():
    """Check how user-supplied unsorted categories are handled.

    For object (string) data the given order is kept and drives the column
    layout of the output. For numerical data an unsorted category list must
    make ``fit_transform`` raise ``ValueError``.
    """
    str_column = np.array([['a', 'b']], dtype=object).T
    str_encoder = OneHotEncoder(categories=[['b', 'a', 'c']])
    expected = np.array([[0., 1., 0.],
                         [1., 0., 0.]])

    two_step = str_encoder.fit(str_column).transform(str_column)
    assert_array_equal(two_step.toarray(), expected)
    one_step = str_encoder.fit_transform(str_column)
    assert_array_equal(one_step.toarray(), expected)

    fitted_cats = str_encoder.categories_[0]
    assert fitted_cats.tolist() == ['b', 'a', 'c']
    assert np.issubdtype(fitted_cats.dtype, np.object_)

    # Numerical values: unsorted categories are still rejected.
    num_column = np.array([[1, 2]]).T
    num_encoder = OneHotEncoder(categories=[[2, 1, 3]])
    msg = 'Unsorted categories are not supported'
    with pytest.raises(ValueError, match=msg):
        num_encoder.fit_transform(num_column)
def test_one_hot_encoder_sparse_dense():
    """Check that sparse and dense output hold the same encoded values."""
    X = np.array([[3, 2, 1], [0, 1, 1]])

    sparse_encoder = OneHotEncoder()
    dense_encoder = OneHotEncoder(sparse=False)

    sparse_out = sparse_encoder.fit_transform(X)
    dense_out = dense_encoder.fit_transform(X)

    # Same shape, different container type.
    assert sparse_out.shape == (2, 5)
    assert dense_out.shape == (2, 5)
    assert sparse.issparse(sparse_out)
    assert not sparse.issparse(dense_out)

    # Same values, checked against the literal expectation and each other.
    expected = [[0., 1., 0., 1., 1.],
                [1., 0., 1., 0., 1.]]
    assert_array_equal(sparse_out.toarray(), expected)
    assert_array_equal(sparse_out.toarray(), dense_out)
def test_one_hot_encoder_inverse(sparse_, drop):
    """Check that ``inverse_transform`` recovers the original data.

    Covers mixed-type and purely numerical input, the unknown-category cases
    (only when ``drop`` is None) and the error raised for input with the
    wrong number of columns.

    NOTE(review): ``sparse_`` and ``drop`` are presumably supplied by pytest
    parametrization that is not visible in this chunk -- confirm.
    """
    # Mixed string / numerical data round-trips to an object array.
    mixed_rows = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
    enc = OneHotEncoder(sparse=sparse_, drop=drop)
    encoded = enc.fit_transform(mixed_rows)
    expected = np.array(mixed_rows, dtype=object)
    assert_array_equal(enc.inverse_transform(encoded), expected)

    # Purely numerical data round-trips to a numerical array.
    numeric_rows = [[2, 55], [1, 55], [3, 55]]
    enc = OneHotEncoder(sparse=sparse_, categories='auto', drop=drop)
    encoded = enc.fit_transform(numeric_rows)
    expected = np.array(numeric_rows)
    assert_array_equal(enc.inverse_transform(encoded), expected)

    if drop is None:
        # Unknown categories can only be exercised without ``drop``:
        # drop is incompatible with handle_unknown=ignore.
        mixed_rows = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
        known_cats = [['abc', 'def'], [1, 2], [54, 55, 56]]
        enc = OneHotEncoder(sparse=sparse_, handle_unknown='ignore',
                            categories=known_cats)
        encoded = enc.fit_transform(mixed_rows)
        expected = np.array(mixed_rows, dtype=object)
        expected[2, 1] = None
        assert_array_equal(enc.inverse_transform(encoded), expected)

        # Output that would otherwise be numerical becomes object dtype
        # once unknown entries have to be represented as None.
        numeric_rows = [[2, 55], [1, 55], [3, 55]]
        enc = OneHotEncoder(sparse=sparse_,
                            categories=[[1, 2], [54, 56]],
                            handle_unknown='ignore')
        encoded = enc.fit_transform(numeric_rows)
        expected = np.array(numeric_rows, dtype=object)
        expected[2, 0] = None
        expected[:, 1] = None
        assert_array_equal(enc.inverse_transform(encoded), expected)

    # Input with the wrong number of columns must be rejected.
    bad_shape = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(bad_shape)
def test_one_hot_encoder_specified_categories_mixed_columns():
    """Check user-specified categories on a string column plus an int column.

    The data is a two-column object array, so the fitted categories of both
    columns are expected to have object dtype.
    """
    # Multiple columns held in a single object-dtype array.
    X = np.array([['a', 'b'], [0, 2]], dtype=object).T
    encoder = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
    expected = np.array([[1., 0., 0., 1., 0., 0.],
                         [0., 1., 0., 0., 0., 1.]])
    assert_array_equal(encoder.fit_transform(X).toarray(), expected)

    string_cats = encoder.categories_[0]
    assert string_cats.tolist() == ['a', 'b', 'c']
    assert np.issubdtype(string_cats.dtype, np.object_)

    integer_cats = encoder.categories_[1]
    assert integer_cats.tolist() == [0, 1, 2]
    # Integer categories, but coming from object dtype data.
    assert np.issubdtype(integer_cats.dtype, np.object_)
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    """Check that ``fit``, ``fit_transform`` and ``transform`` reject NaN.

    When ``as_data_frame`` is true, ``X`` is first wrapped in a pandas
    DataFrame (the test is skipped if pandas cannot be imported).

    NOTE(review): the arguments are presumably supplied by pytest
    parametrization that is not visible in this chunk, with ``X`` containing
    a missing value outside its first row -- confirm.
    """
    nan_msg = "Input contains NaN"

    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    encoder = OneHotEncoder(categories='auto',
                            handle_unknown=handle_unknown)

    with pytest.raises(ValueError, match=nan_msg):
        encoder.fit(X)

    with pytest.raises(ValueError, match=nan_msg):
        encoder.fit_transform(X)

    # Fit on the first row only, which must succeed, so that the error
    # below is raised by ``transform`` itself.
    X_partial = X.iloc[:1, :] if as_data_frame else X[:1, :]
    encoder.fit(X_partial)

    with pytest.raises(ValueError, match=nan_msg):
        encoder.transform(X)
def test_one_hot_encoder_drop_manual():
    """Check encoding when one category per feature is dropped by hand.

    Verifies the encoded array, that ``drop_idx_`` points at the requested
    categories inside ``categories_``, and that ``inverse_transform``
    recovers the input.
    """
    cats_to_drop = ['def', 12, 3, 56]
    X = [['abc', 12, 2, 55],
         ['def', 12, 1, 55],
         ['def', 12, 3, 56]]

    encoder = OneHotEncoder(drop=cats_to_drop)
    encoded = encoder.fit_transform(X).toarray()
    expected = [[1, 0, 1, 1],
                [0, 1, 0, 1],
                [0, 0, 0, 0]]
    assert_array_equal(encoded, expected)

    # Look up, per feature, the category stored at the dropped index.
    dropped_cats = [
        feature_cats[dropped_idx]
        for feature_cats, dropped_idx in zip(encoder.categories_,
                                             encoder.drop_idx_)
    ]
    assert_array_equal(dropped_cats, cats_to_drop)

    recovered = encoder.inverse_transform(encoded)
    assert_array_equal(np.array(X, dtype=object), recovered)
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Check fitting with user-specified categories.

    ``X`` is encoded with ``cats`` and the fitted categories are compared
    with ``cats`` and ``cat_dtype``. ``X2`` must be rejected at fit time
    with the default settings, and encoded with an all-zero second row when
    ``handle_unknown='ignore'``.

    NOTE(review): the arguments are presumably supplied by pytest
    parametrization that is not visible in this chunk -- confirm.
    """
    encoder = OneHotEncoder(categories=cats)
    expected = np.array([[1., 0., 0.],
                         [0., 1., 0.]])
    assert_array_equal(encoder.fit_transform(X).toarray(), expected)

    requested = list(cats[0])
    assert list(encoder.categories[0]) == requested
    fitted_cats = encoder.categories_[0]
    assert fitted_cats.tolist() == requested
    # Manually specified categories should have the same dtype as the data
    # when coerced from lists.
    assert fitted_cats.dtype == cat_dtype

    # With manually specified categories, unknown categories should already
    # raise when fitting.
    strict_encoder = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict_encoder.fit(X2)

    lenient_encoder = OneHotEncoder(categories=cats,
                                    handle_unknown='ignore')
    expected = np.array([[1., 0., 0.],
                         [0., 0., 0.]])
    encoded = lenient_encoder.fit(X2).transform(X2).toarray()
    assert_array_equal(encoded, expected)