Exemplo n.º 1
0
def test_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
    """Check that both ``fit`` and ``transform`` raise on data containing NA.

    Args:
        df_enc_big: fixture used as the frame the encoder can be fitted on
            before the ``transform`` check.
        df_enc_big_na: fixture for which a ``ValueError`` is expected
            (going by its name, the variant of the data containing NA).
    """
    # fit: must raise when the training data contains NA. Only the call under
    # test lives inside the ``raises`` block.
    encoder = OneHotEncoder()
    with pytest.raises(ValueError):
        encoder.fit(df_enc_big_na)

    # transform: must raise when the data to encode contains NA. The encoder
    # is fitted outside the ``raises`` block so that an unexpected ValueError
    # from ``fit`` fails the test instead of satisfying it.
    encoder = OneHotEncoder()
    encoder.fit(df_enc_big)
    with pytest.raises(ValueError):
        encoder.transform(df_enc_big_na)
Exemplo n.º 2
0
def test_get_feature_names_out(df_enc_binary):
    """Exercise ``get_feature_names_out`` under four encoder configurations.

    For each configuration (default, ``drop_last=True``,
    ``drop_last_binary=True`` and ``top_categories=1``) the encoder is fitted
    on ``df_enc_binary`` and the returned names are compared with the
    expected list, both for ``input_features=None`` and for subsets of the
    encoded variables. Finally, a bare string and a list containing a name
    outside the variables used here (``"hola"``) must raise ``ValueError``.
    """
    passthrough = ["var_num"]
    encoded_vars = ["var_A", "var_B", "var_C", "var_D"]

    # --- default settings -------------------------------------------------
    encoder = OneHotEncoder()
    encoder.fit(df_enc_binary)

    expected = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_C_UHU",
        "var_D_OHO",
        "var_D_EHE",
    ]

    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=encoded_vars)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=encoded_vars[:2])
    assert names == expected[:6]
    names = encoder.get_feature_names_out(input_features=encoded_vars[:1])
    assert names == expected[:3]

    # --- drop_last=True ---------------------------------------------------
    encoder = OneHotEncoder(drop_last=True)
    encoder.fit(df_enc_binary)

    expected = [
        "var_A_A",
        "var_A_B",
        "var_B_A",
        "var_B_B",
        "var_C_AHA",
        "var_D_OHO",
    ]

    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=encoded_vars)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=encoded_vars[:2])
    assert names == expected[:4]
    names = encoder.get_feature_names_out(input_features=encoded_vars[:1])
    assert names == expected[:2]

    # --- drop_last_binary=True --------------------------------------------
    encoder = OneHotEncoder(drop_last_binary=True)
    encoder.fit(df_enc_binary)

    expected = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_D_OHO",
    ]

    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=encoded_vars)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=encoded_vars[:1])
    assert names == expected[:3]
    names = encoder.get_feature_names_out(input_features=encoded_vars[3:])
    assert names == expected[-1:]

    # --- top_categories=1 -------------------------------------------------
    encoder = OneHotEncoder(top_categories=1)
    encoder.fit(df_enc_binary)

    expected = ["var_A_B", "var_B_A", "var_C_AHA", "var_D_EHE"]

    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=encoded_vars)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=encoded_vars[:2])
    assert names == expected[:2]
    names = encoder.get_feature_names_out(input_features=encoded_vars[3:])
    assert names == expected[3:]

    # --- invalid input_features -------------------------------------------
    # A bare string instead of a list.
    with pytest.raises(ValueError):
        encoder.get_feature_names_out("var_A")

    # A list that includes a name not among the variables used above.
    with pytest.raises(ValueError):
        encoder.get_feature_names_out(["var_A", "hola"])
Exemplo n.º 3
0
    st.selectbox('Do you want the house to be furnished ?', ('Yes', 'No')))
# Each remaining preference is asked through a Yes/No select box; the chosen
# label is then passed through word_convert and stored under the feature name.
security_doors = st.selectbox('Do you want security doors ?', ('Yes', 'No'))
security_doors = word_convert(security_doors)

cctv = st.selectbox('Do you want CCTV surveillance ?', ('Yes', 'No'))
cctv = word_convert(cctv)

bq = st.selectbox('Do you want Boys Quarters ?', ('Yes', 'No'))
bq = word_convert(bq)

gym = st.selectbox('Do you need gym facilities ?', ('Yes', 'No'))
gym = word_convert(gym)

pool = st.selectbox('Do you need swimming pool ?', ('Yes', 'No'))
pool = word_convert(pool)

# Modeling step

# Split the data into the target column and the predictor columns.
# ``axis`` is passed by keyword: the positional form ``drop('Price', 1)`` was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
target = data['Price']
features = data.drop('Price', axis=1)

# Encoding step: fit the one-hot encoder on the predictors, then replace them
# with their encoded version.
encode = OneHotEncoder()
encode.fit(features)
features = encode.transform(features)

# Hold out 20% of the rows for testing; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0)
# Instantiate the regressor with its default settings
model = RandomForestRegressor()
# Train it on the training split
model.fit(X_train, y_train)
# Predicted values
Exemplo n.º 4
0
# Building a model with all the features and using a pipeline
num_features = (
    df.select_dtypes(include=['int64', 'float64'])
    .drop(columns='Survived')
    .columns
)
num_features
# %%
cat_features = df.select_dtypes(include=['category', 'object']).columns
cat_features
#%%
features = df.drop(columns='Survived').columns.to_list()
features
# %%
# One-hot encode only the three listed variables, keeping every category.
onehot = OneHotEncoder(
    variables=['Pclass', 'Sex', 'Embarked'],
    drop_last=False,
)

# %%
onehot.fit(df[features])
onehot.transform(df[features]).head()
# %%
X = onehot.transform(df[features])
y = df['Survived']
print(X.shape)
# %%
# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit an L2-penalised logistic regression on the training split.
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
logistic_model.fit(X_train, y_train)
print(logistic_model)
 def encorder(self, y):
     """Fit a fresh ``OneHotEncoder`` on ``y`` and return ``y`` transformed.

     ``y`` is presumably a dataframe (per the original docstring); it is
     used both to fit the encoder and as the data to encode.
     """
     encoder = OneHotEncoder()
     encoder.fit(y)
     encoded = encoder.transform(y)
     return encoded