def test_encode_numerical_variables(df_enc_numeric):
    """With ignore_format=True, numeric variables are one-hot encoded too."""
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=False,
        ignore_format=True,
    )
    X = encoder.fit_transform(df_enc_numeric[["var_A", "var_B"]])

    # fitted attributes
    assert encoder.variables_ == ["var_A", "var_B"]
    assert encoder.n_features_in_ == 2

    # expected output: one dummy column per original numeric category
    expected = pd.DataFrame(
        {
            "var_A_1": [1] * 6 + [0] * 14,
            "var_A_2": [0] * 6 + [1] * 10 + [0] * 4,
            "var_A_3": [0] * 16 + [1] * 4,
            "var_B_1": [1] * 10 + [0] * 10,
            "var_B_2": [0] * 10 + [1] * 6 + [0] * 4,
            "var_B_3": [0] * 16 + [1] * 4,
        }
    ).astype("int32")

    # test transform output
    pd.testing.assert_frame_equal(pd.DataFrame(X).astype("int32"), expected)
def test_encode_into_kminus1_binary_plus_drop_binary(df_enc_binary):
    """drop_last + drop_last_binary: k-1 dummies, one dummy for binary vars."""
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=True,
        drop_last_binary=True,
    )
    X = encoder.fit_transform(df_enc_binary)

    # fitted attributes
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.n_features_in_ == 4

    # expected output after dropping the last dummy of every variable
    expected = pd.DataFrame(
        {
            "target": [1, 1] + [0] * 8 + [1, 1] + [0] * 4 + [1, 1] + [0] * 2,
            "var_A_A": [1] * 6 + [0] * 14,
            "var_A_B": [0] * 6 + [1] * 10 + [0] * 4,
            "var_B_A": [1] * 10 + [0] * 10,
            "var_B_B": [0] * 10 + [1] * 6 + [0] * 4,
            "var_C_A": [1] * 10 + [0] * 10,
        }
    ).astype("int32")

    # test transform output
    pd.testing.assert_frame_equal(pd.DataFrame(X).astype("int32"), expected)
    # the second dummy of the binary variable must have been dropped
    assert "var_C_B" not in X.columns
def test_encode_top_categories(df_enc_big):
    """Only the 4 most frequent categories of each variable are encoded."""
    encoder = OneHotEncoder(top_categories=4, variables=None, drop_last=False)
    X = encoder.fit_transform(df_enc_big)

    # init params
    assert encoder.top_categories == 4
    # fit attr
    assert encoder.n_features_in_ == 3

    # dummy column -> expected number of 1s in the transformed output
    expected_counts = {
        "var_A_D": 10,
        "var_A_B": 10,
        "var_A_A": 6,
        "var_A_G": 6,
        "var_B_A": 10,
        "var_B_D": 10,
        "var_B_G": 6,
        "var_B_B": 6,
        "var_C_D": 10,
        "var_C_C": 10,
        "var_C_G": 6,
        "var_C_B": 6,
    }
    for col, count in expected_counts.items():
        assert X[col].sum() == count

    # original column removed; infrequent category not encoded
    assert "var_B" not in X.columns
    assert "var_B_F" not in X.columns
def test_encode_categories_in_k_minus_1_binary_plus_list_of_variables(df_enc_big):
    """k-1 dummies for an explicit variable list; other columns untouched."""
    encoder = OneHotEncoder(
        top_categories=None, variables=["var_A", "var_B"], drop_last=True
    )
    X = encoder.fit_transform(df_enc_big)

    # init params
    assert encoder.top_categories is None
    assert encoder.variables == ["var_A", "var_B"]
    assert encoder.drop_last is True
    # fit attr
    assert encoder.input_shape_ == (40, 3)

    # dummy column -> expected number of 1s in the transformed output
    expected_counts = {
        "var_A_A": 6,
        "var_A_B": 10,
        "var_A_C": 4,
        "var_A_D": 10,
        "var_A_E": 2,
        "var_A_F": 2,
        "var_B_A": 10,
        "var_B_B": 6,
        "var_B_C": 4,
        "var_B_D": 10,
        "var_B_E": 2,
        "var_B_F": 2,
    }
    for col, count in expected_counts.items():
        assert X[col].sum() == count

    # last category dropped; the unlisted variable is left as-is
    assert "var_B" not in X.columns
    assert "var_B_G" not in X.columns
    assert "var_C" in X.columns
def test_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
    """Both fit and transform reject datasets that contain NA values."""
    # fit on data with NA must raise
    with pytest.raises(ValueError):
        enc = OneHotEncoder()
        enc.fit(df_enc_big_na)

    # transform of data with NA (after a clean fit) must raise
    with pytest.raises(ValueError):
        enc = OneHotEncoder()
        enc.fit(df_enc_big)
        enc.transform(df_enc_big_na)
def test_encode_categories_in_k_binary_plus_select_vars_automatically(df_enc_big):
    """All categories become k dummies; variables are selected automatically."""
    encoder = OneHotEncoder(top_categories=None, variables=None, drop_last=False)
    X = encoder.fit_transform(df_enc_big)

    # init params
    assert encoder.top_categories is None
    assert encoder.variables is None
    assert encoder.drop_last is False

    # fitted attributes
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.variables_binary_ == []
    assert encoder.n_features_in_ == 3
    assert encoder.encoder_dict_ == {
        "var_A": ["A", "B", "C", "D", "E", "F", "G"],
        "var_B": ["A", "B", "C", "D", "E", "F", "G"],
        "var_C": ["A", "B", "C", "D", "E", "F", "G"],
    }

    # transform output: dummy column -> expected count of 1s
    expected_counts = {
        "var_A_A": 6,
        "var_A_B": 10,
        "var_A_C": 4,
        "var_A_D": 10,
        "var_A_E": 2,
        "var_A_F": 2,
        "var_A_G": 6,
        "var_B_A": 10,
        "var_B_B": 6,
        "var_B_C": 4,
        "var_B_D": 10,
        "var_B_E": 2,
        "var_B_F": 2,
        "var_B_G": 6,
        "var_C_A": 4,
        "var_C_B": 6,
        "var_C_C": 10,
        "var_C_D": 10,
        "var_C_E": 2,
        "var_C_F": 2,
        "var_C_G": 6,
    }
    assert X.sum().to_dict() == expected_counts
    # original column replaced by its dummies
    assert "var_A" not in X.columns
def test_get_feature_names_out_from_pipeline(df_enc_binary):
    """get_feature_names_out works when the encoder sits inside a Pipeline."""
    original_features = ["var_num"]
    input_features = ["var_A", "var_B", "var_C", "var_D"]

    tr = Pipeline([("transformer", OneHotEncoder())])
    tr.fit(df_enc_binary)

    # dummy names produced for the four categorical variables
    out = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_C_UHU",
        "var_D_OHO",
        "var_D_EHE",
    ]
    # full output, then progressively smaller subsets of input features
    assert tr.get_feature_names_out(input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=input_features) == out
    assert tr.get_feature_names_out(input_features=input_features[0:2]) == out[0:6]
    assert tr.get_feature_names_out(input_features=[input_features[0]]) == out[0:3]
def test_encode_top_categories():
    """Encode only the 4 most popular categories of each variable."""
    df = pd.DataFrame(
        {
            "var_A": ["A"] * 5 + ["B"] * 11 + ["C"] * 4 + ["D"] * 9
            + ["E"] * 2 + ["F"] * 2 + ["G"] * 7,
            "var_B": ["A"] * 11 + ["B"] * 7 + ["C"] * 4 + ["D"] * 9
            + ["E"] * 2 + ["F"] * 2 + ["G"] * 5,
            "var_C": ["A"] * 4 + ["B"] * 5 + ["C"] * 11 + ["D"] * 9
            + ["E"] * 2 + ["F"] * 2 + ["G"] * 7,
        }
    )

    encoder = OneHotEncoder(top_categories=4, variables=None, drop_last=False)
    X = encoder.fit_transform(df)

    # init params
    assert encoder.top_categories == 4

    # fitted attributes: categories are ordered by descending frequency
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.variables_binary_ == []
    assert encoder.n_features_in_ == 3
    assert encoder.encoder_dict_ == {
        "var_A": ["B", "D", "G", "A"],
        "var_B": ["A", "D", "B", "G"],
        "var_C": ["C", "D", "G", "B"],
    }

    # transform output: dummy column -> expected number of 1s
    expected_counts = {
        "var_A_D": 9,
        "var_A_B": 11,
        "var_A_A": 5,
        "var_A_G": 7,
        "var_B_A": 11,
        "var_B_D": 9,
        "var_B_G": 5,
        "var_B_B": 7,
        "var_C_D": 9,
        "var_C_C": 11,
        "var_C_G": 7,
        "var_C_B": 5,
    }
    for col, count in expected_counts.items():
        assert X[col].sum() == count

    # original column removed; infrequent category not encoded
    assert "var_B" not in X.columns
    assert "var_B_F" not in X.columns
def test_encode_into_k_dummy_plus_drop_binary(df_enc_binary):
    """drop_last_binary alone: k dummies, except binary vars get one dummy."""
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=False,
        drop_last_binary=True,
    )
    X = encoder.fit_transform(df_enc_binary).astype("int32")

    # fitted attributes
    assert encoder.variables_ == ["var_A", "var_B", "var_C", "var_D"]
    assert encoder.variables_binary_ == ["var_C", "var_D"]
    assert encoder.n_features_in_ == 5
    assert encoder.encoder_dict_ == {
        "var_A": ["A", "B", "C"],
        "var_B": ["A", "B", "C"],
        "var_C": ["AHA"],
        "var_D": ["OHO"],
    }

    # expected output: all dummies for var_A/var_B, single dummy for binaries
    expected = pd.DataFrame(
        {
            "var_num": [1, 1] + [0] * 8 + [1, 1] + [0] * 4 + [1, 1] + [0] * 2,
            "var_A_A": [1] * 6 + [0] * 14,
            "var_A_B": [0] * 6 + [1] * 10 + [0] * 4,
            "var_A_C": [0] * 16 + [1] * 4,
            "var_B_A": [1] * 10 + [0] * 10,
            "var_B_B": [0] * 10 + [1] * 6 + [0] * 4,
            "var_B_C": [0] * 16 + [1] * 4,
            "var_C_AHA": [1] * 12 + [0] * 8,
            "var_D_OHO": [1] * 5 + [0] * 15,
        }
    ).astype("int32")

    # test transform output
    pd.testing.assert_frame_equal(X, expected)
    assert "var_C_B" not in X.columns
def create_pipeline(params: dict = None):
    """Build the full modelling pipeline (preprocessing + XGBClassifier).

    Parameters
    ----------
    params : dict, optional
        Parameters to set on the assembled pipeline via ``set_params``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # numeric branch: missing indicators, median/mean imputation, then drop
    # quasi-constant columns
    numeric_pipeline = Pipeline([
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ])

    # categorical branch: fill NAs, group rare labels, one-hot encode
    categorical_pipeline = Pipeline([
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ])

    # route columns to the matching branch by dtype
    column_transformer = ColumnTransformer([
        ("num", numeric_pipeline, make_column_selector(dtype_include=np.number)),
        ("cat", categorical_pipeline, make_column_selector(dtype_include=object)),
    ])

    # GPU-accelerated XGBoost classifier on top of the preprocessing
    classifier = XGBClassifier(
        min_child_weight=1,
        gamma=0,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=1,
        gpu_id=0,
        tree_method='gpu_hist',
    )
    pipeline = Pipeline([
        ("col_transformers", column_transformer),
        ("xgb", classifier),
    ])

    if params:
        pipeline.set_params(**params)
    return pipeline
# get numerical labels
# FIX: DataFrame._get_numeric_data() is a private pandas API; use the public
# select_dtypes instead (equivalent here -- assumes no bool columns, confirm).
numerical_labels = X_train.select_dtypes(include='number').columns.tolist()
categorical_labels = X_train.select_dtypes(
    include=['object']).columns.tolist()

# moving 'MSSubClass' feature from numerical to categorical
# (it is a coded category, not a true numeric quantity)
numerical_labels.remove('MSSubClass')
categorical_labels.append('MSSubClass')

print(f'Numerical labels are (contains ordinal cat):{numerical_labels}')
print(f'Categorical labels are:{categorical_labels}')

# numeric columns: median imputation only (scaling intentionally disabled)
num_pipeline = Pipeline([
    ('imputer', MeanMedianImputer(imputation_method='median'))
])

# categorical columns: fill NAs with 'Missing', then one-hot encode all levels
cat_pipeline = Pipeline([
    ('imputer', CategoricalImputer(imputation_method='missing',
                                   fill_value='Missing')),
    ('one_hot', OneHotEncoder(top_categories=None, drop_last=False)),
])

# combined transformer routing each column list to its branch
# NOTE(review): only cat_pipeline is fitted below -- confirm whether
# full_pipeline was meant to be used instead.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, numerical_labels),
    ('cat', cat_pipeline, categorical_labels),
])

X_converted = cat_pipeline.fit_transform(X_train)
print(X_converted.head())
# Collect yes/no amenity answers from the UI and map them to numeric codes.
security = word_convert(
    st.selectbox('Do you need security guards ?', ('Yes', 'No')))
furnished = word_convert(
    st.selectbox('Do you want the house to be furnished ?', ('Yes', 'No')))
security_doors = word_convert(
    st.selectbox('Do you want security doors ?', ('Yes', 'No')))
cctv = word_convert(
    st.selectbox('Do you want CCTV surveillance ?', ('Yes', 'No')))
bq = word_convert(st.selectbox('Do you want Boys Quarters ?', ('Yes', 'No')))
gym = word_convert(st.selectbox('Do you need gym facilities ?', ('Yes', 'No')))
pool = word_convert(st.selectbox('Do you need swimming pool ?', ('Yes', 'No')))

# Modeling step

# Encoding Step: one-hot encode the categorical feature columns.
encode = OneHotEncoder()
target = data['Price']
# BUG FIX: `data.drop('Price', 1)` passed axis positionally, which was
# deprecated in pandas 1.0 and removed in pandas 2.0 -- use the keyword form.
features = data.drop(columns='Price')
encode.fit(features)
features = encode.transform(features)

# Getting the target and features variables:
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0)

# Creating the algorithm class
model = RandomForestRegressor()
def encorder(self, y):
    """One-hot encode the target dataframe ``y`` and return the result."""
    onehot = OneHotEncoder()
    onehot.fit(y)
    return onehot.transform(y)
def test_error_if_top_categories_not_integer():
    """A non-integer top_categories is rejected at construction time."""
    with pytest.raises(ValueError):
        OneHotEncoder(top_categories=0.5)
def test_error_if_drop_last_not_bool():
    """A non-boolean drop_last is rejected at construction time."""
    with pytest.raises(ValueError):
        OneHotEncoder(drop_last=0.5)
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    """Each imputer passes scikit-learn's estimator compatibility checks."""
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    # extreme tol / n_categories so no label is grouped during the checks
    RareLabelEncoder(
        tol=0.00000000001,
        n_categories=100000000000,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    """Each encoder passes scikit-learn's estimator compatibility checks."""
    check(estimator)


# outliers
from flask import Flask, render_template, request
import pickle
from pandas import DataFrame
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import MinMaxScaler

###########################################################################################
# preprocessing pipeline
# Categorical columns that get one-hot encoded.
cat_features = ['Fuel_Type', 'Seller_Type', 'Transmission']

# One-hot encode the categorical features, then scale Kms_Driven to 0-100.
pipe = Pipeline(steps=[('one hot encoder',
                        OneHotEncoder(variables=cat_features)),
                       ('min max scaler',
                        SklearnTransformerWrapper(variables=['Kms_Driven'],
                                                  transformer=MinMaxScaler(
                                                      feature_range=(0, 100))))])
###########################################################################################

# Load the fitted preprocessor and model from disk.
# NOTE(review): pickle.load is unsafe on untrusted files -- confirm these
# artifacts are produced only by this project.
with open("pipe.pkl", "rb") as f:
    preprocessor = pickle.load(f)

with open("best_model.pkl", "rb") as f:
    model = pickle.load(f)


def predict_price(ex):
    # NOTE: this definition continues beyond the visible chunk.
    columns = [
        'Year', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type',
def test_get_feature_names_out(df_enc_binary):
    """get_feature_names_out returns the dummy names for each configuration."""
    original_features = ["var_num"]
    input_features = ["var_A", "var_B", "var_C", "var_D"]

    # Case 1: defaults -> k dummies per variable.
    tr = OneHotEncoder()
    tr.fit(df_enc_binary)
    out = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_C_UHU",
        "var_D_OHO",
        "var_D_EHE",
    ]
    assert tr.get_feature_names_out(
        input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=input_features) == out
    assert tr.get_feature_names_out(
        input_features=input_features[0:2]) == out[0:6]
    assert tr.get_feature_names_out(
        input_features=[input_features[0]]) == out[0:3]

    # Case 2: drop_last=True -> k-1 dummies per variable.
    tr = OneHotEncoder(drop_last=True)
    tr.fit(df_enc_binary)
    out = [
        "var_A_A",
        "var_A_B",
        "var_B_A",
        "var_B_B",
        "var_C_AHA",
        "var_D_OHO",
    ]
    assert tr.get_feature_names_out(
        input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=input_features) == out
    assert tr.get_feature_names_out(
        input_features=input_features[0:2]) == out[0:4]
    assert tr.get_feature_names_out(
        input_features=[input_features[0]]) == out[0:2]

    # Case 3: drop_last_binary=True -> one dummy for binary variables only.
    tr = OneHotEncoder(drop_last_binary=True)
    tr.fit(df_enc_binary)
    out = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_D_OHO",
    ]
    assert tr.get_feature_names_out(
        input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=input_features) == out
    assert tr.get_feature_names_out(
        input_features=[input_features[0]]) == out[0:3]
    assert tr.get_feature_names_out(input_features=[input_features[3]]) == [
        out[-1]
    ]

    # Case 4: top_categories=1 -> a single dummy per variable.
    tr = OneHotEncoder(top_categories=1)
    tr.fit(df_enc_binary)
    out = ["var_A_B", "var_B_A", "var_C_AHA", "var_D_EHE"]
    assert tr.get_feature_names_out(
        input_features=None) == original_features + out
    assert tr.get_feature_names_out(input_features=input_features) == out
    assert tr.get_feature_names_out(
        input_features=input_features[0:2]) == out[0:2]
    assert tr.get_feature_names_out(input_features=[input_features[3]]) == [
        out[3]
    ]

    # Invalid input_features values raise ValueError.
    with pytest.raises(ValueError):
        tr.get_feature_names_out("var_A")

    with pytest.raises(ValueError):
        tr.get_feature_names_out(["var_A", "hola"])
# Stop training when validation loss stops improving for 20 epochs.
early_stop = EarlyStopping(monitor='val_loss',
                           mode='min',
                           min_delta=0,
                           verbose=1,
                           patience=20)

# End-to-end pipeline: keep/clean features, transform and discretize
# numericals, group categories, one-hot encode, scale, then fit a Keras
# classifier (assumes `reduce_lr` is a callback defined elsewhere -- confirm).
pump_pipeline = Pipeline(
    steps=[("feature_to_keeper",
            pp.FeatureKeeper(variables_to_keep=config.VARIABLES_TO_KEEP)),
           ("missing_imputer",
            pp.MissingImputer(numerical_variables=config.NUMERICAL_VARIABLES)),
           ("yeoJohnson",
            YeoJohnsonTransformer(variables=config.YEO_JHONSON_VARIABLES)),
           ("discretization",
            EqualWidthDiscretiser(bins=5,
                                  variables=config.NUMERICAL_VARIABLES)),
           ("categorical_grouper",
            pp.CategoricalGrouping(config_dict=config.VARIABLES_TO_GROUP)),
           ("rareCategories_grouper",
            pp.RareCategoriesGrouping(threshold=config.VARIABLES_THRESHOLD)),
           ("one_hot_encoder",
            OneHotEncoder(variables=config.REAL_CATEGORICAL_VARIABLES,
                          drop_last=False)),
           ("scaler", MinMaxScaler()),
           ("model",
            KerasClassifier(build_fn=create_model,
                            epochs=1,
                            validation_split=0.2,
                            batch_size=256,
                            verbose=1,
                            callbacks=[early_stop, reduce_lr],
                            shuffle=True))])
# %% titanic_train_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test) titanic_train_crosstab # %% # Criando um modelo com todas as features e usando pipeline num_features = df.select_dtypes(include=['int64', 'float64']).drop( 'Survived', axis=1).columns num_features # %% cat_features = df.select_dtypes(include=['category', 'object']).columns cat_features #%% features = df.drop('Survived', axis=1).columns.to_list() features # %% onehot = OneHotEncoder(variables=['Pclass', 'Sex', 'Embarked'], drop_last=False) # %% onehot.fit(df[features]) onehot.transform(df[features]).head() # %% X = onehot.transform(df[features]) y = df['Survived'] print(X.shape) # %% # Separate into train and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)