def test_arbitrary_encoding_automatically_find_variables(df_enc):
    """Check arbitrary encoding when the encoder picks the variables itself.

    Test case 2: ``variables=None`` and no target is passed to the encoder.
    """
    encoder = OrdinalEncoder(encoding_method="arbitrary", variables=None)
    transformed = encoder.fit_transform(df_enc)

    # Expected frame: same as the input, with both columns replaced by codes.
    expected = df_enc.copy()
    expected["var_A"] = [0] * 6 + [1] * 10 + [2] * 4
    expected["var_B"] = [0] * 10 + [1] * 6 + [2] * 4

    # Init parameters are stored untouched.
    assert encoder.encoding_method == "arbitrary"
    assert encoder.variables is None

    # Attributes learned during fit.
    assert encoder.variables_ == ["var_A", "var_B"]
    expected_mapping = {"A": 0, "B": 1, "C": 2}
    assert encoder.encoder_dict_ == {
        "var_A": dict(expected_mapping),
        "var_B": dict(expected_mapping),
    }
    assert encoder.n_features_in_ == 3

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected)
def test_variables_cast_as_category(df_enc_category_dtypes):
    """Ordered encoding of ``var_A`` from the category-dtype fixture.

    The transformed ``var_A`` must hold the expected codes and compare equal
    to ``int`` dtype.
    """
    df = df_enc_category_dtypes.copy()
    features = df[["var_A", "var_B"]]

    encoder = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    encoder.fit(features, df["target"])
    transformed = encoder.transform(features)

    # Expected frame: var_A replaced by its codes, var_B left as supplied.
    expected = df.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # dtypes are compared separately below, so relax them here.
    pd.testing.assert_frame_equal(
        transformed,
        expected[["var_A", "var_B"]],
        check_dtype=False,
    )
    assert transformed["var_A"].dtypes == int
def test_ordered_encoding_1_variable(df_enc):
    """Test case 1: ordered encoding applied to a single variable, ``var_A``."""
    features = df_enc[["var_A", "var_B"]]

    encoder = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    encoder.fit(features, df_enc["target"])
    transformed = encoder.transform(features)

    # Expected frame: only var_A is re-coded.
    expected = df_enc.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # Init parameters.
    assert encoder.encoding_method == "ordered"
    assert encoder.variables == ["var_A"]

    # Attributes learned during fit.
    assert encoder.encoder_dict_ == {"var_A": {"A": 1, "B": 0, "C": 2}}
    assert encoder.input_shape_ == (20, 2)

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected[["var_A", "var_B"]])
def test_error_if_input_df_contains_categories_not_present_in_training_df(
        df_enc, df_enc_rare):
    """Test case 4: transform warns about categories unseen during fit.

    The encoder is built and fitted outside the ``pytest.warns`` block so that
    only a ``UserWarning`` emitted by ``transform`` can satisfy the test; a
    warning raised during construction or fitting no longer masks a missing
    warning at transform time.
    """
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)

    with pytest.warns(UserWarning):
        encoder.transform(df_enc_rare)
def test_ordered_encoding_1_variable_ignore_format(df_enc_numeric):
    """Ordered encoding of ``var_A`` from the numeric fixture with ``ignore_format=True``."""
    features = df_enc_numeric[["var_A", "var_B"]]

    encoder = OrdinalEncoder(
        encoding_method="ordered",
        variables=["var_A"],
        ignore_format=True,
    )
    encoder.fit(features, df_enc_numeric["target"])
    transformed = encoder.transform(features)

    # Expected frame: only var_A is re-coded.
    expected = df_enc_numeric.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # Init parameters.
    assert encoder.encoding_method == "ordered"
    assert encoder.variables == ["var_A"]

    # Attributes learned during fit; the mapping keys are the numeric labels.
    assert encoder.variables_ == ["var_A"]
    assert encoder.encoder_dict_ == {"var_A": {1: 1, 2: 0, 3: 2}}
    assert encoder.n_features_in_ == 2

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected[["var_A", "var_B"]])
def test_arbitrary_encoding_automatically_find_variables_ignore_format(
        df_enc_numeric):
    """Arbitrary encoding with ``variables=None`` and ``ignore_format=True``.

    Only ``var_A`` and ``var_B`` of the numeric fixture are passed in, and the
    test expects both to be selected and encoded.
    """
    features = df_enc_numeric[["var_A", "var_B"]]

    encoder = OrdinalEncoder(
        encoding_method="arbitrary",
        variables=None,
        ignore_format=True,
    )
    transformed = encoder.fit_transform(features)

    # Expected frame: both columns replaced by their codes.
    expected = features.copy()
    expected["var_A"] = [0] * 6 + [1] * 10 + [2] * 4
    expected["var_B"] = [0] * 10 + [1] * 6 + [2] * 4

    # Init parameters.
    assert encoder.encoding_method == "arbitrary"
    assert encoder.variables is None

    # Attributes learned during fit.
    assert encoder.variables_ == ["var_A", "var_B"]
    expected_mapping = {1: 0, 2: 1, 3: 2}
    assert encoder.encoder_dict_ == {
        "var_A": dict(expected_mapping),
        "var_B": dict(expected_mapping),
    }
    assert encoder.n_features_in_ == 2

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected)
def feature_engineering_ordinal_encoding(X_train, y_train, X_test):
    """Ordinal-encode selected columns of the train and test sets.

    An ``OrdinalEncoder`` with ``encoding_method='ordered'`` is fitted on
    ``X_train`` / ``y_train`` only and then applied to both data sets.

    Returns
    -------
    tuple
        ``(train_t, test_t)``: the transformed ``X_train`` and ``X_test``.
    """
    # NOTE: 'hypertension' and 'heart_disease' were disabled (commented out)
    # in the original variable list and are therefore not encoded here.
    columns_to_encode = [
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ]

    encoder = OrdinalEncoder(
        encoding_method='ordered',
        variables=columns_to_encode,
    )
    encoder.fit(X_train, y_train)

    # Train set is transformed first, then the test set.
    return encoder.transform(X_train), encoder.transform(X_test)
def test_error_if_input_df_contains_categories_not_present_in_training_df(
        df_enc, df_enc_rare):
    """Test case 4: data to transform holds categories absent from training.

    With ``errors="ignore"`` exactly one ``UserWarning`` carrying the expected
    message is required; with ``errors="raise"`` a ``ValueError`` carrying the
    same message is required.
    """
    expected_message = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )
    columns = ["var_A", "var_B"]

    # errors="ignore": a warning is expected.
    with pytest.warns(UserWarning) as warning_records:
        encoder = OrdinalEncoder(errors="ignore")
        encoder.fit(df_enc[columns], df_enc["target"])
        encoder.transform(df_enc_rare[columns])

    # Exactly one warning, with the expected text.
    assert len(warning_records) == 1
    assert warning_records[0].message.args[0] == expected_message

    # errors="raise": an exception is expected.
    with pytest.raises(ValueError) as error_info:
        encoder = OrdinalEncoder(errors="raise")
        encoder.fit(df_enc[columns], df_enc["target"])
        encoder.transform(df_enc_rare[columns])

    # The exception text matches the warning text.
    assert str(error_info.value) == expected_message
# Load the data set
df = pd.read_csv("test_data.csv")
df.head()

# Rescale the monetary columns to smaller magnitudes
df["price_per_size"] = df["price_per_size"] / 10000
df["price"] = df["price"] / 1000000
df["rent"] = df["rent"] / 1000

# Separate features from the target and split into train / test sets
X = df.drop(columns=['price'], axis=1)
Y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

# Encode the regions: the encoder is fitted on the training split only
regions_df = np.asarray(X['region']).reshape(1, -1)
enc = OrdinalEncoder(encoding_method='ordered', variables=['region'])
enc.fit(X_train, y_train)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

# Fit the model on the encoded training data
regressor = xgboost.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3,
)
regressor.fit(X_train_enc, y_train)

# Predict on the encoded test data and round each prediction
y_pred = regressor.predict(X_test_enc)
predictions = list(map(round, y_pred))
def test_error_if_encoding_method_not_allowed():
    """Constructing the encoder with ``encoding_method="other"`` must raise ValueError."""
    invalid_params = {"encoding_method": "other"}
    with pytest.raises(ValueError):
        OrdinalEncoder(**invalid_params)
def test_error_if_ordinal_encoding_and_no_y_passed(df_enc):
    """Test case 3: ordered encoding must raise ValueError when fit gets no target.

    The encoder is constructed outside the ``pytest.raises`` block so that the
    expected ``ValueError`` can only come from ``fit``.
    """
    encoder = OrdinalEncoder(encoding_method="ordered")

    with pytest.raises(ValueError):
        encoder.fit(df_enc)
def test_error_if_rare_labels_not_permitted_value():
    """Constructing the encoder with ``errors="empanada"`` must raise ValueError."""
    invalid_params = {"errors": "empanada"}
    with pytest.raises(ValueError):
        OrdinalEncoder(**invalid_params)
"mapper_garage", pp.Mapper( variables=config.model_config.garage_vars, mappings=config.model_config.garage_mappings, ), ), # == CATEGORICAL ENCODING ( "rare_label_encoder", RareLabelEncoder(tol=0.01, n_categories=1, variables=config.model_config.categorical_vars), ), # encode categorical variables using the target mean ( "categorical_encoder", OrdinalEncoder( encoding_method="ordered", variables=config.model_config.categorical_vars, ), ), ("scaler", MinMaxScaler()), ( "Lasso", Lasso( alpha=config.model_config.alpha, random_state=config.model_config.random_state, ), ), ])
DecisionTreeEncoder, MeanEncoder, OneHotEncoder, OrdinalEncoder, PRatioEncoder, RareLabelEncoder, WoEEncoder, ) from tests.estimator_checks.estimator_checks import check_feature_engine_estimator _estimators = [ CountFrequencyEncoder(ignore_format=True), DecisionTreeEncoder(regression=False, ignore_format=True), MeanEncoder(ignore_format=True), OneHotEncoder(ignore_format=True), OrdinalEncoder(ignore_format=True), RareLabelEncoder( tol=0.00000000001, n_categories=100000000000, replace_with=10, ignore_format=True, ), WoEEncoder(ignore_format=True), PRatioEncoder(ignore_format=True), ] @pytest.mark.parametrize("estimator", _estimators) def test_check_estimator_from_sklearn(estimator): return check_estimator(estimator)
EndTailImputer(), AddMissingIndicator(), RandomSampleImputer(), DropMissingData(), ]) def test_sklearn_compatible_imputer(estimator, check): check(estimator) # encoding @parametrize_with_checks([ CountFrequencyEncoder(ignore_format=True), DecisionTreeEncoder(regression=False, ignore_format=True), MeanEncoder(ignore_format=True), OneHotEncoder(ignore_format=True), OrdinalEncoder(ignore_format=True), RareLabelEncoder( tol=0.00000000001, n_categories=100000000000, replace_with=10, ignore_format=True, ), WoEEncoder(ignore_format=True), PRatioEncoder(ignore_format=True), ]) def test_sklearn_compatible_encoder(estimator, check): check(estimator) # outliers @parametrize_with_checks([
# Random-forest pipeline: imputation -> rare-label grouping -> categorical
# encoding -> discretisation of continuous features -> scaling -> classifier.
rf_pipe = Pipeline(
    steps=[
        # --- imputation ---
        (
            'numeric_impute',
            MeanMedianImputer(
                imputation_method='median',
                variables=config.CONTINUOUS_FEATURES,
            ),
        ),
        (
            'categorical_impute',
            CategoricalImputer(
                imputation_method='missing',
                variables=(
                    config.CATEGORICAL_FEATURES
                    + config.DISCRETE_SET1_FEATURES
                    + config.DISCRETE_SET2_FEATURES
                    + config.DISCRETE_SET3_FEATURES
                ),
            ),
        ),
        # --- rare labels are replaced with the string 'Rare' ---
        (
            'rare_label_encode',
            RareLabelEncoder(
                tol=0.02,
                n_categories=10,
                variables=(
                    config.CATEGORICAL_FEATURES
                    + config.DISCRETE_SET1_FEATURES
                    + config.DISCRETE_SET2_FEATURES
                    + config.DISCRETE_SET3_FEATURES
                ),
                replace_with='Rare',
            ),
        ),
        # --- categorical encoding: a different encoder per feature set ---
        (
            'categorical_encode1',
            OrdinalEncoder(
                encoding_method='arbitrary',
                variables=(
                    config.CATEGORICAL_FEATURES
                    + config.DISCRETE_SET2_FEATURES
                ),
            ),
        ),
        (
            'categorical_encode2',
            OrdinalEncoder(
                encoding_method='ordered',
                variables=config.DISCRETE_SET1_FEATURES,
            ),
        ),
        (
            'categorical_encode3',
            CountFrequencyEncoder(
                encoding_method='count',
                variables=config.DISCRETE_SET3_FEATURES,
            ),
        ),
        # --- continuous features: discretise, then encode the bins ---
        (
            'continuous_discretization',
            EqualFrequencyDiscretiser(
                q=20,
                variables=config.CONTINUOUS_FEATURES,
                return_object=True,
            ),
        ),
        (
            'continuous_encoding',
            OrdinalEncoder(
                encoding_method='ordered',
                variables=config.CONTINUOUS_FEATURES,
            ),
        ),
        # --- scaling and final estimator ---
        ('scaling', StandardScaler()),
        (
            'clf',
            RandomForestClassifier(
                criterion='gini',
                max_depth=10,
                min_samples_split=10,
                random_state=0,
            ),
        ),
    ]
)
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """Test case 4: ``transform`` must raise ValueError for the NA fixture.

    Construction and fitting happen outside the ``pytest.raises`` block so
    that a ``ValueError`` raised by either of them cannot make the test pass
    without ``transform`` ever being exercised.
    """
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)

    with pytest.raises(ValueError):
        encoder.transform(df_enc_na)
def test_non_fitted_error(df_enc):
    """Calling ``transform`` on an unfitted encoder must raise NotFittedError.

    The encoder is constructed outside the ``pytest.raises`` block so that the
    expected exception can only come from ``transform``.
    """
    encoder = OrdinalEncoder()

    with pytest.raises(NotFittedError):
        encoder.transform(df_enc)