def test_error_if_input_df_contains_categories_not_present_in_training_df(
    df_enc, df_enc_rare
):
    """Check how ``transform`` reacts to categories that were not seen in ``fit``.

    The encoder is fitted on ``df_enc`` and applied to ``df_enc_rare``, the
    dataset that contains categories not present in the training dataset.
    Two values of the ``errors`` parameter are exercised:

    * ``errors="ignore"``: exactly one ``UserWarning`` with the expected
      message must be emitted.
    * ``errors="raise"``: a ``ValueError`` with the same message must be raised.

    Only the ``transform`` call sits inside the ``pytest`` context managers, so
    a warning or error coming from the constructor or from ``fit`` cannot
    satisfy the expectation by accident.
    """
    features = ["var_A", "var_B"]
    msg = "During the encoding, NaN values were introduced in the feature(s) var_A."

    # errors="ignore": unseen categories must only trigger a warning
    encoder = OrdinalEncoder(errors="ignore")
    encoder.fit(df_enc[features], df_enc["target"])
    with pytest.warns(UserWarning) as warning_record:
        encoder.transform(df_enc_rare[features])

    # check that only one warning was raised and that its message matches
    assert len(warning_record) == 1
    assert warning_record[0].message.args[0] == msg

    # errors="raise": unseen categories must abort the transformation
    encoder = OrdinalEncoder(errors="raise")
    encoder.fit(df_enc[features], df_enc["target"])
    with pytest.raises(ValueError) as exc_info:
        encoder.transform(df_enc_rare[features])

    # check that the error message matches
    assert str(exc_info.value) == msg
def test_error_if_input_df_contains_categories_not_present_in_training_df(
    df_enc, df_enc_rare
):
    """Check that transforming unseen categories emits a ``UserWarning``.

    Test case 4: the dataset to be transformed (``df_enc_rare``) contains
    categories not present in the training dataset (``df_enc``). Despite the
    word "error" in the test name, the expectation here is a warning.

    The encoder is built and fitted outside the ``pytest.warns`` block so that
    only a warning emitted by ``transform`` can satisfy the test.
    """
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)

    with pytest.warns(UserWarning):
        encoder.transform(df_enc_rare)
def test_variables_cast_as_category(df_enc_category_dtypes):
    """Check ordered encoding of ``var_A`` on the category-dtype fixture.

    The encoder is fitted on ``var_A``/``var_B`` with ``target`` as ``y`` and
    only ``var_A`` is encoded. The transformed frame must match the expected
    integer codes (dtypes are not compared by the frame assertion), and the
    encoded column must end up with an integer dtype.
    """
    features = ["var_A", "var_B"]
    df = df_enc_category_dtypes.copy()

    encoder = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    encoder.fit(df[features], df["target"])
    X = encoder.transform(df[features])

    # expected output: 6 rows coded 1, then 10 rows coded 0, then 4 rows coded 2
    transf_df = df.copy()
    transf_df["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # test transform output
    pd.testing.assert_frame_equal(X, transf_df[features], check_dtype=False)

    # Comparing against the builtin ``int`` only matches the platform's default
    # NumPy integer width, so ask pandas whether the dtype is an integer instead.
    assert pd.api.types.is_integer_dtype(X["var_A"])
def test_ordered_encoding_1_variable(df_enc):
    """Test case 1: ordered encoding applied to a single variable (``var_A``).

    Verifies the init parameters, the attributes learned in ``fit`` and the
    frame returned by ``transform``.
    """
    columns = ["var_A", "var_B"]

    encoder = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    encoder.fit(df_enc[columns], df_enc["target"])
    X = encoder.transform(df_enc[columns])

    # expected output: 6 rows coded 1, then 10 rows coded 0, then 4 rows coded 2
    expected = df_enc.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # init params
    assert encoder.encoding_method == "ordered"
    assert encoder.variables == ["var_A"]

    # attributes learned during fit
    assert encoder.encoder_dict_ == {"var_A": {"A": 1, "B": 0, "C": 2}}
    assert encoder.input_shape_ == (20, 2)

    # transform output
    pd.testing.assert_frame_equal(X, expected[columns])
def feature_engineering_ordinal_encoding(X_train, y_train, X_test):
    """Ordinal-encode selected columns of the train and test sets.

    An ``OrdinalEncoder`` with ``encoding_method='ordered'`` is fitted on
    ``X_train`` and ``y_train`` only, then applied to both ``X_train`` and
    ``X_test``.

    Returns a ``(train_t, test_t)`` tuple with the transformed train and test
    sets, in that order.
    """
    # NOTE: 'hypertension' and 'heart_disease' were commented out of this list
    # in the original code, so they are not passed to the encoder.
    columns_to_encode = [
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ]

    encoder = OrdinalEncoder(encoding_method='ordered', variables=columns_to_encode)
    encoder.fit(X_train, y_train)

    return encoder.transform(X_train), encoder.transform(X_test)
def test_ordered_encoding_1_variable_ignore_format(df_enc_numeric):
    """Ordered encoding of ``var_A`` on the numeric fixture with ``ignore_format=True``.

    Verifies the init parameters, the attributes learned in ``fit`` and the
    frame returned by ``transform``.
    """
    columns = ["var_A", "var_B"]

    encoder = OrdinalEncoder(
        encoding_method="ordered",
        variables=["var_A"],
        ignore_format=True,
    )
    encoder.fit(df_enc_numeric[columns], df_enc_numeric["target"])
    X = encoder.transform(df_enc_numeric[columns])

    # expected output: 6 rows coded 1, then 10 rows coded 0, then 4 rows coded 2
    expected = df_enc_numeric.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # init params
    assert encoder.encoding_method == "ordered"
    assert encoder.variables == ["var_A"]

    # attributes learned during fit
    assert encoder.variables_ == ["var_A"]
    assert encoder.encoder_dict_ == {"var_A": {1: 1, 2: 0, 3: 2}}
    assert encoder.n_features_in_ == 2

    # transform output
    pd.testing.assert_frame_equal(X, expected[columns])
df.head()

# Rescale columns by fixed factors
df["price_per_size"] = df["price_per_size"] / 10_000
df["price"] = df["price"] / 1_000_000
df["rent"] = df["rent"] / 1_000

# Train/test split: 'price' is the target, every other column is a feature
X = df.drop(columns=['price'], axis=1)
Y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

# Encode the regions; the encoder is fitted on the training split only
# NOTE(review): regions_df is not used in the visible part of the script
regions_df = np.asarray(X['region']).reshape(1, -1)
enc = OrdinalEncoder(encoding_method='ordered', variables=['region'])
enc.fit(X_train, y_train)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

# Fit the model on the encoded training data
regressor = xgboost.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3,
)
regressor.fit(X_train_enc, y_train)

# Make predictions for the test data, rounded to the nearest integer
y_pred = regressor.predict(X_test_enc)
predictions = list(map(round, y_pred))

# evaluate predictions
def test_error_if_ordinal_encoding_and_no_y_passed(df_enc):
    """Test case 3: ordered encoding raises ``ValueError`` if no target is passed.

    The encoder is constructed outside the ``pytest.raises`` block so that only
    a ``ValueError`` raised by ``fit`` itself can satisfy the test.
    """
    encoder = OrdinalEncoder(encoding_method="ordered")

    with pytest.raises(ValueError):
        encoder.fit(df_enc)
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """Test case 4: ``transform`` raises ``ValueError`` when the dataset contains NA.

    The encoder is fitted on ``df_enc`` and then applied to ``df_enc_na``.
    Construction and ``fit`` happen outside the ``pytest.raises`` block, so a
    ``ValueError`` raised while fitting makes the test fail rather than pass.
    """
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)

    with pytest.raises(ValueError):
        encoder.transform(df_enc_na)