def test_one_hot_encoder_not_fitted():
    # ``transform`` on an encoder that was never fitted must raise
    # ``NotFittedError`` with the standard "not fitted yet" message.
    expected_msg = ("This OneHotEncoder instance is not fitted yet. "
                    "Call 'fit' with appropriate arguments before using "
                    "this method.")
    samples = np.array([['a'], ['b']])
    encoder = OneHotEncoder(categories=['a', 'b'])
    with pytest.raises(NotFittedError, match=expected_msg):
        encoder.transform(samples)
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
    # Feature names reported after fitting with ``drop`` must come back as a
    # numpy array equal to ``expected_names``.
    samples = [['c', 2, 'a'],
               ['b', 2, 'b']]
    encoder = OneHotEncoder(drop=drop)
    encoder.fit(samples)
    names = encoder.get_feature_names()
    assert isinstance(names, np.ndarray)
    assert_array_equal(expected_names, names)
def test_one_hot_encoder(X):
    # Encode the first column, then the first two columns, then the full
    # input, and compare each result with the expected indicator matrix.
    as_array = np.array(X)

    one_col = check_categorical_onehot(as_array[:, [0]])
    assert_allclose(one_col, [[0, 1], [1, 0]])

    two_cols = check_categorical_onehot(as_array[:, [0, 1]])
    assert_allclose(two_cols, [[0, 1, 1, 0], [1, 0, 0, 1]])

    all_cols = OneHotEncoder(categories='auto').fit_transform(X)
    assert_allclose(all_cols.toarray(),
                    [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    # The learned categories should not depend on the order of the samples,
    # so the same checks run on the input and on its reverse.
    for samples in (X, X[::-1]):
        encoder = OneHotEncoder(categories='auto')
        encoder.fit(samples)
        learned = encoder.categories_
        assert isinstance(learned, list)
        for found, wanted in zip(learned, cat_exp):
            assert found.tolist() == wanted
            assert np.issubdtype(found.dtype, cat_dtype)
def test_one_hot_encoder_specified_categories_mixed_columns():
    # multiple columns: one of strings, one of integers, both object dtype
    samples = np.array([['a', 'b'], [0, 2]], dtype=object).T
    encoder = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
    expected = np.array([[1., 0., 0., 1., 0., 0.],
                         [0., 1., 0., 0., 0., 1.]])
    assert_array_equal(encoder.fit_transform(samples).toarray(), expected)

    str_cats = encoder.categories_[0]
    assert str_cats.tolist() == ['a', 'b', 'c']
    assert np.issubdtype(str_cats.dtype, np.object_)

    int_cats = encoder.categories_[1]
    assert int_cats.tolist() == [0, 1, 2]
    # integer categories but from object dtype data
    assert np.issubdtype(int_cats.dtype, np.object_)
def test_one_hot_encoder_drop_manual():
    # One category to drop is given explicitly for each of the four features.
    to_drop = ['def', 12, 3, 56]
    samples = [['abc', 12, 2, 55],
               ['def', 12, 1, 55],
               ['def', 12, 3, 56]]
    encoder = OneHotEncoder(drop=to_drop)
    encoded = encoder.fit_transform(samples).toarray()
    expected = [[1, 0, 1, 1],
                [0, 1, 0, 1],
                [0, 0, 0, 0]]
    assert_array_equal(encoded, expected)

    # Recover the dropped category of each feature through ``drop_idx_``.
    dropped = []
    for cats, idx in zip(encoder.categories_, encoder.drop_idx_):
        dropped.append(cats[idx])
    assert_array_equal(dropped, to_drop)

    assert_array_equal(np.array(samples, dtype=object),
                       encoder.inverse_transform(encoded))
def test_one_hot_encoder_set_params():
    samples = np.array([[1, 2]]).T
    encoder = OneHotEncoder()

    # set params on not yet fitted object
    encoder.set_params(categories=[[0, 1, 2, 3]])
    assert encoder.get_params()['categories'] == [[0, 1, 2, 3]]
    dense = encoder.fit_transform(samples).toarray()
    assert dense.shape == (2, 4)

    # set params on already fitted object
    encoder.set_params(categories=[[0, 1, 2, 3, 4]])
    dense = encoder.fit_transform(samples).toarray()
    assert dense.shape == (2, 5)
def test_one_hot_encoder_feature_names():
    samples = [['Male', 1, 'girl', 2, 3],
               ['Female', 41, 'girl', 1, 10],
               ['Male', 51, 'boy', 12, 3],
               ['Male', 91, 'girl', 21, 30]]
    encoder = OneHotEncoder()
    encoder.fit(samples)

    # Without arguments the names are checked against ``x<i>_`` prefixes.
    default_names = encoder.get_feature_names()
    assert isinstance(default_names, np.ndarray)
    expected_default = [
        'x0_Female', 'x0_Male',
        'x1_1', 'x1_41', 'x1_51', 'x1_91',
        'x2_boy', 'x2_girl',
        'x3_1', 'x3_2', 'x3_12', 'x3_21',
        'x4_3', 'x4_10', 'x4_30',
    ]
    assert_array_equal(expected_default, default_names)

    # With explicit input feature names those are used as prefixes instead.
    custom_names = encoder.get_feature_names(
        ['one', 'two', 'three', 'four', 'five'])
    expected_custom = [
        'one_Female', 'one_Male',
        'two_1', 'two_41', 'two_51', 'two_91',
        'three_boy', 'three_girl',
        'four_1', 'four_2', 'four_12', 'four_21',
        'five_3', 'five_10', 'five_30',
    ]
    assert_array_equal(expected_custom, custom_names)

    # Passing the wrong number of input feature names is an error.
    with pytest.raises(ValueError,
                       match="input_features should have length"):
        encoder.get_feature_names(['one', 'two'])
def test_one_hot_encoder_handle_unknown_strings():
    # Non Regression test for the issue #12470
    # Test the ignore option, when categories are numpy string dtype
    # particularly when the known category strings are larger
    # than the unknown category strings
    known = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
    queries = np.array(['55555', '22']).reshape((-1, 1))

    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(known)

    queries_passed = queries.copy()
    encoded = encoder.transform(queries_passed).toarray()
    expected = np.array([[0., 0., 0., 0.],
                         [0., 1., 0., 0.]])
    assert_array_equal(encoded, expected)
    # ensure transformed data was not modified in place
    assert_array_equal(queries, queries_passed)
def test_permutation_importance_mixed_types_pandas():
    pd = pytest.importorskip("pandas")
    random_state = np.random.RandomState(42)
    repeats = 5

    # Last column is correlated with y
    frame = pd.DataFrame({
        'col1': [1.0, 2.0, 3.0, np.nan],
        'col2': ['a', 'b', 'a', 'b'],
    })
    target = np.array([0, 1, 0, 1])

    numeric_steps = make_pipeline(SimpleImputer(), StandardScaler())
    column_steps = ColumnTransformer([
        ('num', numeric_steps, ['col1']),
        ('cat', OneHotEncoder(), ['col2']),
    ])
    model = make_pipeline(column_steps, LogisticRegression(solver='lbfgs'))
    model.fit(frame, target)

    result = permutation_importance(model, frame, target,
                                    n_repeats=repeats,
                                    random_state=random_state)

    assert result.importances.shape == (frame.shape[1], repeats)
    # the correlated feature with y is the last column and should
    # have the highest importance
    mean_importance = result.importances_mean
    assert np.all(mean_importance[-1] > mean_importance[:-1])
def test_encode_options():
    # The one-hot outputs of KBinsDiscretizer must equal a OneHotEncoder
    # applied to its ordinal output, for both dense and sparse encodings.
    def discretize(encode):
        binner = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode=encode).fit(X)
        return binner.transform(X)

    def one_hot_reference(codes, sparse_output):
        per_feature = [np.arange(i) for i in [2, 3, 3, 3]]
        encoder = OneHotEncoder(categories=per_feature, sparse=sparse_output)
        return encoder.fit_transform(codes)

    ordinal = discretize('ordinal')

    dense = discretize('onehot-dense')
    assert not sp.issparse(dense)
    assert_array_equal(one_hot_reference(ordinal, False), dense)

    sparse_out = discretize('onehot')
    assert sp.issparse(sparse_out)
    assert_array_equal(one_hot_reference(ordinal, True).toarray(),
                       sparse_out.toarray())
def test_X_is_not_1D_pandas(method):
    # Calling ``method`` with a 1D pandas Series must raise a ValueError.
    pd = pytest.importorskip('pandas')
    series = pd.Series([6, 3, 4, 6])
    encoder = OneHotEncoder()
    with pytest.raises(ValueError,
                       match="Expected 2D array, got 1D array instead"):
        getattr(encoder, method)(series)
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    encoder = OneHotEncoder(categories=cats)
    expected = np.array([[1., 0., 0.],
                         [0., 1., 0.]])
    assert_array_equal(encoder.fit_transform(X).toarray(), expected)

    requested = list(cats[0])
    assert list(encoder.categories[0]) == requested
    assert encoder.categories_[0].tolist() == requested
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert encoder.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    strict = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict.fit(X2)

    lenient = OneHotEncoder(categories=cats, handle_unknown='ignore')
    expected = np.array([[1., 0., 0.],
                         [0., 0., 0.]])
    assert_array_equal(lenient.fit(X2).transform(X2).toarray(), expected)
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
    # The encoded values must match the expectation built with
    # ``output_dtype``, for sparse and dense output and for both
    # ``fit_transform`` and ``fit`` followed by ``transform``.
    samples = np.asarray([[0, 1]], dtype=input_dtype).T
    expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

    sparse_enc = OneHotEncoder(categories='auto', dtype=output_dtype)
    assert_array_equal(sparse_enc.fit_transform(samples).toarray(), expected)
    assert_array_equal(sparse_enc.fit(samples).transform(samples).toarray(),
                       expected)

    dense_enc = OneHotEncoder(categories='auto', dtype=output_dtype,
                              sparse=False)
    assert_array_equal(dense_enc.fit_transform(samples), expected)
    assert_array_equal(dense_enc.fit(samples).transform(samples), expected)
def test_column_transformer_negative_column_indexes():
    # Selecting the last column as ``-1`` or as ``2`` must give equal output.
    numeric_part = np.random.randn(2, 2)
    category_part = np.array([[1], [2]])
    samples = np.concatenate([numeric_part, category_part], axis=1)

    encoder = OneHotEncoder()
    by_negative = ColumnTransformer([('ohe', encoder, [-1])],
                                    remainder='passthrough')
    by_positive = ColumnTransformer([('ohe', encoder, [2])],
                                    remainder='passthrough')
    assert_array_equal(by_negative.fit_transform(samples),
                       by_positive.fit_transform(samples))
def test_one_hot_encoder_dtype_pandas(output_dtype):
    # Same dtype check as for arrays, but the input is a pandas DataFrame
    # with one string column and one integer column.
    pd = pytest.importorskip('pandas')
    frame = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    expected = np.array([[1, 0, 1, 0],
                         [0, 1, 0, 1]], dtype=output_dtype)

    sparse_enc = OneHotEncoder(dtype=output_dtype)
    assert_array_equal(sparse_enc.fit_transform(frame).toarray(), expected)
    assert_array_equal(sparse_enc.fit(frame).transform(frame).toarray(),
                       expected)

    dense_enc = OneHotEncoder(dtype=output_dtype, sparse=False)
    assert_array_equal(dense_enc.fit_transform(frame), expected)
    assert_array_equal(dense_enc.fit(frame).transform(frame), expected)
def test_column_transformer_mixed_cols_sparse():
    samples = np.array([['a', 1, True], ['b', 2, False]], dtype='O')

    # this shouldn't fail, since boolean can be coerced into a numeric
    # See: https://github.com/scikit-learn/scikit-learn/issues/11912
    numeric_passthrough = make_column_transformer(
        (OneHotEncoder(), [0]),
        ('passthrough', [1, 2]),
        sparse_threshold=1.0)
    transformed = numeric_passthrough.fit_transform(samples)
    assert transformed.getformat() == 'csr'
    expected = np.array([[1, 0, 1, 1],
                         [0, 1, 2, 0]])
    assert_array_equal(transformed.toarray(), expected)

    string_passthrough = make_column_transformer(
        (OneHotEncoder(), [0]),
        ('passthrough', [0]),
        sparse_threshold=1.0)
    with pytest.raises(ValueError,
                       match="For a sparse output, all columns should"):
        # this fails since strings `a` and `b` cannot be
        # coerced into a numeric.
        string_passthrough.fit_transform(samples)
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    # ``fit``, ``fit_transform`` and ``transform`` are each expected to raise
    # "Input contains NaN" when given X.
    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)

    for fit_like in (ohe.fit, ohe.fit_transform):
        with pytest.raises(ValueError, match="Input contains NaN"):
            fit_like(X)

    # Fit on the first row only, then transform the complete input.
    X_partial = X.iloc[:1, :] if as_data_frame else X[:1, :]
    ohe.fit(X_partial)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)
def test_categories(density, drop):
    # ``drop`` must not change the learned categories, and ``drop_idx_`` must
    # point at the requested category of every feature.
    samples = [['c', 1, 'a'],
               ['a', 2, 'b']]
    reference = OneHotEncoder(sparse=density)
    dropping = OneHotEncoder(sparse=density, drop=drop)
    reference.fit(samples)
    dropping.fit(samples)
    assert_array_equal(reference.categories_, dropping.categories_)

    if drop == 'first':
        assert_array_equal(dropping.drop_idx_, 0)
    else:
        per_feature = zip(drop, dropping.drop_idx_, dropping.categories_)
        for wanted, idx, cats in per_feature:
            assert cats[idx] == wanted

    assert isinstance(dropping.drop_idx_, np.ndarray)
    assert dropping.drop_idx_.dtype == np.int_
def test_one_hot_encoder_feature_names_unicode():
    # Non-ASCII characters in categories and in input feature names must be
    # carried over unchanged into the generated feature names.
    samples = np.array([['c❤t1', 'dat2']], dtype=object).T
    encoder = OneHotEncoder()
    encoder.fit(samples)

    default_names = encoder.get_feature_names()
    assert_array_equal(['x0_c❤t1', 'x0_dat2'], default_names)

    custom_names = encoder.get_feature_names(input_features=['n👍me'])
    assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], custom_names)
def test_column_transformer_list():
    # A plain list of lists is passed as input to the ColumnTransformer.
    rows = [[1, float('nan'), 'a'],
            [0, 0, 'b']]
    expected = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    transformer = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(transformer.fit_transform(rows), expected)
    assert_array_equal(transformer.fit(rows).transform(rows), expected)
def test_one_hot_encoder_unsorted_categories():
    # Unsorted string categories are accepted and keep their given order.
    samples = np.array([['a', 'b']], dtype=object).T
    encoder = OneHotEncoder(categories=[['b', 'a', 'c']])
    expected = np.array([[0., 1., 0.],
                         [1., 0., 0.]])
    assert_array_equal(encoder.fit(samples).transform(samples).toarray(),
                       expected)
    assert_array_equal(encoder.fit_transform(samples).toarray(), expected)
    assert encoder.categories_[0].tolist() == ['b', 'a', 'c']
    assert np.issubdtype(encoder.categories_[0].dtype, np.object_)

    # unsorted passed categories still raise for numerical values
    numeric_samples = np.array([[1, 2]]).T
    numeric_encoder = OneHotEncoder(categories=[[2, 1, 3]])
    with pytest.raises(ValueError,
                       match='Unsorted categories are not supported'):
        numeric_encoder.fit_transform(numeric_samples)
def test_one_hot_encoder_diff_n_features():
    # Fit on three features, then transform data with only two features.
    fitted_on = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    too_narrow = np.array([[1, 0]])
    expected_msg = ("The number of features in X is different to the "
                    "number of features of the fitted data.")

    encoder = OneHotEncoder()
    encoder.fit(fitted_on)
    with pytest.raises(ValueError, match=expected_msg):
        encoder.transform(too_narrow)
def test_encoder_dtypes_pandas():
    # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
    pd = pytest.importorskip('pandas')

    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0., 1., 0.],
                    [0., 1., 0., 1., 0., 1.]], dtype='float64')

    # All three columns are int64, so every learned category array must be
    # int64 too. The length check ensures no column is silently left out.
    X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
    enc.fit(X)
    assert len(enc.categories_) == X.shape[1]
    assert all(cats.dtype == 'int64' for cats in enc.categories_)
    assert_array_equal(enc.transform(X).toarray(), exp)

    # Mixed column dtypes: each category array must keep the dtype of the
    # column it was learned from.
    X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
    X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
    enc.fit(X)
    assert len(enc.categories_) == len(X_type)
    assert all(cats.dtype == col_dtype
               for cats, col_dtype in zip(enc.categories_, X_type))
    assert_array_equal(enc.transform(X).toarray(), exp)
def check_categorical_onehot(X):
    # Encode X once with the default output and once with ``sparse=False``,
    # check that both agree and that the first result is a CSR matrix, then
    # return that first result as a dense array.
    sparse_result = OneHotEncoder(categories='auto').fit_transform(X)
    dense_result = OneHotEncoder(categories='auto',
                                 sparse=False).fit_transform(X)

    assert_allclose(sparse_result.toarray(), dense_result)
    assert sparse.isspmatrix_csr(sparse_result)
    return sparse_result.toarray()
def test_column_transformer_sparse_threshold():
    X_array = np.array([['a', 'b'], ['A', 'B']], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5

    def fit_with(first_sparse, second_sparse, threshold):
        # Build a two-encoder ColumnTransformer and return it together with
        # the result of fitting and transforming ``X_array``.
        transformer = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=first_sparse), [0]),
             ('trans2', OneHotEncoder(sparse=second_sparse), [1])],
            sparse_threshold=threshold)
        return transformer, transformer.fit_transform(X_array)

    # apply threshold even if all sparse
    all_default = ColumnTransformer([('trans1', OneHotEncoder(), [0]),
                                     ('trans2', OneHotEncoder(), [1])],
                                    sparse_threshold=0.2)
    out = all_default.fit_transform(X_array)
    assert not sparse.issparse(out)
    assert not all_default.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    for threshold in [0.75001, 1]:
        transformer, out = fit_with(True, False, threshold)
        assert sparse.issparse(out)
        assert transformer.sparse_output_

    for threshold in [0.75, 0]:
        transformer, out = fit_with(True, False, threshold)
        assert not sparse.issparse(out)
        assert not transformer.sparse_output_

    # if nothing is sparse -> no sparse
    for threshold in [0.33, 0, 1]:
        transformer, out = fit_with(False, False, threshold)
        assert not sparse.issparse(out)
        assert not transformer.sparse_output_
def test_one_hot_encoder_sparse_dense():
    # check that sparse and dense will give the same results
    samples = np.array([[3, 2, 1], [0, 1, 1]])

    as_sparse = OneHotEncoder().fit_transform(samples)
    as_dense = OneHotEncoder(sparse=False).fit_transform(samples)

    for encoded in (as_sparse, as_dense):
        assert encoded.shape == (2, 5)

    assert sparse.issparse(as_sparse)
    assert not sparse.issparse(as_dense)

    # check outcome
    expected = [[0., 1., 0., 1., 1.],
                [1., 0., 1., 0., 1.]]
    assert_array_equal(as_sparse.toarray(), expected)
    assert_array_equal(as_sparse.toarray(), as_dense)
def test_invalid_drop_length(drop):
    # Fitting three-feature data with the given ``drop`` is expected to
    # raise a ValueError about the length of ``drop``.
    samples = [['abc', 2, 55],
               ['def', 1, 55],
               ['def', 3, 59]]
    encoder = OneHotEncoder(drop=drop)
    with pytest.raises(
            ValueError,
            match="`drop` should have length equal to the number"):
        encoder.fit(samples)
test_size=0.5) # Unsupervised transformation based on totally random trees rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator, random_state=0) rt_lm = LogisticRegression(max_iter=1000) pipeline = make_pipeline(rt, rt_lm) pipeline.fit(X_train, y_train) y_pred_rt = pipeline.predict_proba(X_test)[:, 1] fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt) # Supervised transformation based on random forests rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator) rf_enc = OneHotEncoder() rf_lm = LogisticRegression(max_iter=1000) rf.fit(X_train, y_train) rf_enc.fit(rf.apply(X_train)) rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr) y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1] fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm) # Supervised transformation based on gradient boosted trees grd = GradientBoostingClassifier(n_estimators=n_estimator) grd_enc = OneHotEncoder() grd_lm = LogisticRegression(max_iter=1000) grd.fit(X_train, y_train) grd_enc.fit(grd.apply(X_train)[:, :, 0]) grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))