Exemplo n.º 1
0
def test_MeanCategoricalEncoder():
    """Check mean encoding of a single categorical column on a small frame."""
    # Input frame: 10 rows of 'A', 6 of 'B' and 4 of 'C'. Within each group
    # the first two targets are 1 and the remaining ones are 0.
    labels = ['A'] * 10 + ['B'] * 6 + ['C'] * 4
    targets = [1, 1] + [0] * 8 + [1, 1] + [0] * 4 + [1, 1] + [0] * 2
    df = pd.DataFrame({'category': labels, 'target': targets})

    # Expected frame: each label replaced by the target mean of its group
    # (2/10, 2/6 and 2/4 respectively).
    transf_df = pd.DataFrame({
        'category': [0.200000] * 10 + [0.333333] * 6 + [0.500000] * 4,
        'target': list(targets),
    })

    encoder = MeanCategoricalEncoder(variables=['category'])
    encoder.fit(df['category'].to_frame(), df['target'])
    X = encoder.transform(df['category'].to_frame())

    # Transformed output, then the init and fit attributes.
    expected_mapping = {
        'category': {
            'A': 0.2,
            'B': 0.3333333333333333,
            'C': 0.5,
        }
    }
    pd.testing.assert_frame_equal(X, transf_df['category'].to_frame())
    assert encoder.variables == ['category']
    assert encoder.encoder_dict_ == expected_mapping
    assert encoder.input_shape_ == (20, 1)
def train_models(ModelClass,
                 invoices,
                 observation_end_dates,
                 rfe=False,
                 **kwargs):
    """Train one model per observation end date and collect the results.

    For every date, the train/test split returned by ``get_train_test_data``
    is taken, the training data of the most recent ``lag`` dates is
    concatenated, the 'MostBoughtItem' column is rare-label encoded and then
    mean encoded, and a ``ModelClass(**kwargs)`` instance is fitted.

    NOTE(review): ``lag`` is neither a parameter nor a local of this
    function; it has to be resolvable from the enclosing/module scope.

    Args:
        ModelClass: Callable that builds an estimator from ``**kwargs``.
        invoices: Data forwarded unchanged to ``get_train_test_data``.
        observation_end_dates: Iterable of dates; one model is trained for
            each of them.
        rfe: When true, ``RFE`` with ``n_features_to_select=8`` picks the
            columns the final model is fitted on.
        **kwargs: Keyword arguments passed to ``ModelClass``.

    Returns:
        dict with the keys 'models', 'observation_end_dates', 'X_train',
        'y_train', 'X_test' and 'y_test'. All but 'observation_end_dates'
        are lists holding one entry per date.
    """
    train_results = {
        'models': [],
        'observation_end_dates': observation_end_dates,
        'X_train': [],
        'y_train': [],
        'X_test': [],
        'y_test': [],
    }
    X_history = []
    y_history = []

    for end_date in observation_end_dates:
        X_month, y_month, X_test, y_test = get_train_test_data(
            invoices, end_date)
        X_history.append(X_month)
        y_history.append(y_month)

        # The last "lag" entries of the history form the training data.
        X_train = pd.concat(X_history[-lag:]).reset_index(drop=True)
        y_train = pd.concat(y_history[-lag:]).reset_index(drop=True)
        y_train = y_train.astype(int)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True).astype(int)

        # Encode "MostBoughtItem": group rare labels first (with a looser
        # tolerance for training sets under 100 rows), then mean encode.
        rare_tol = 0.02 if len(X_train) < 100 else 0.01
        rare_encoder = RareLabelCategoricalEncoder(
            tol=rare_tol, variables=['MostBoughtItem']).fit(X_train)
        X_train = rare_encoder.transform(X_train)
        X_test = rare_encoder.transform(X_test)
        mean_enc = MeanCategoricalEncoder(variables=['MostBoughtItem']).fit(
            X_train, y_train)
        X_train = mean_enc.transform(X_train)
        X_test = mean_enc.transform(X_test)

        if rfe:
            # Keep only the columns selected by recursive feature elimination.
            selector = RFE(ModelClass(**kwargs), n_features_to_select=8)
            selector.fit(X_train, y_train)
            kept_columns = X_train.columns[selector.get_support()]
            model = ModelClass(**kwargs).fit(X_train[kept_columns], y_train)
            X_train_used = X_train[kept_columns]
            X_test_used = X_test[kept_columns]
        else:
            model = ModelClass(**kwargs)
            model.fit(X_train, y_train)
            X_train_used = X_train
            X_test_used = X_test

        per_date = (
            ('X_train', X_train_used),
            ('X_test', X_test_used),
            ('models', model),
            ('y_train', y_train),
            ('y_test', y_test),
        )
        for key, value in per_date:
            train_results[key].append(value)

    return train_results
Exemplo n.º 3
0
def test_MeanCategoricalEncoder(dataframe_enc, dataframe_enc_rare,
                                dataframe_enc_na):
    """Exercise MeanCategoricalEncoder on the shared encoder fixtures.

    Covers encoding of an explicit variable, automatic variable selection,
    and the error/warning paths: missing target, unseen categories, missing
    values in fit and in transform, and transform before fit.

    Set-up code is kept outside the ``pytest.raises`` / ``pytest.warns``
    blocks so that only the call under test can satisfy each expectation.

    Args:
        dataframe_enc: Fixture frame providing 'var_A', 'var_B' and 'target'.
        dataframe_enc_rare: Fixture frame used to trigger the warning about
            categories that were not seen during fit.
        dataframe_enc_na: Fixture frame used to trigger the missing-value
            errors.
    """
    features = ['var_A', 'var_B']
    expected_var_A = [0.3333333333333333] * 6 + [0.2] * 10 + [0.5] * 4
    expected_var_B = [0.2] * 10 + [0.3333333333333333] * 6 + [0.5] * 4

    # test case 1: 1 variable
    encoder = MeanCategoricalEncoder(variables=['var_A'])
    encoder.fit(dataframe_enc[features], dataframe_enc['target'])
    X = encoder.transform(dataframe_enc[features])

    # transformed dataframe
    transf_df = dataframe_enc.copy()
    transf_df['var_A'] = expected_var_A

    # init params
    assert encoder.variables == ['var_A']
    # fit params
    assert encoder.encoder_dict_ == {
        'var_A': {
            'A': 0.3333333333333333,
            'B': 0.2,
            'C': 0.5
        }
    }
    assert encoder.input_shape_ == (20, 2)
    # transform params
    pd.testing.assert_frame_equal(X, transf_df[features])

    # test case 2: automatically select variables
    encoder = MeanCategoricalEncoder(variables=None)
    encoder.fit(dataframe_enc[features], dataframe_enc['target'])
    X = encoder.transform(dataframe_enc[features])

    # transformed dataframe
    transf_df['var_A'] = expected_var_A
    transf_df['var_B'] = expected_var_B

    # init params
    assert encoder.variables == ['var_A', 'var_B']
    # fit params
    assert encoder.encoder_dict_ == {
        'var_A': {
            'A': 0.3333333333333333,
            'B': 0.2,
            'C': 0.5
        },
        'var_B': {
            'A': 0.2,
            'B': 0.3333333333333333,
            'C': 0.5
        }
    }
    assert encoder.input_shape_ == (20, 2)
    # transform params
    pd.testing.assert_frame_equal(X, transf_df[features])

    # test case 3: raises error if target is not passed
    encoder = MeanCategoricalEncoder()
    with pytest.raises(TypeError):
        encoder.fit(dataframe_enc)

    # test case 4: when dataset to be transformed contains categories not
    # present in training dataset
    encoder = MeanCategoricalEncoder()
    encoder.fit(dataframe_enc[features], dataframe_enc['target'])
    with pytest.warns(UserWarning):
        encoder.transform(dataframe_enc_rare[features])

    # test case 5: when dataset contains na, fit method
    encoder = MeanCategoricalEncoder()
    with pytest.raises(ValueError):
        encoder.fit(dataframe_enc_na[features], dataframe_enc_na['target'])

    # test case 6: when dataset contains na, transform method
    encoder = MeanCategoricalEncoder()
    encoder.fit(dataframe_enc[features], dataframe_enc['target'])
    with pytest.raises(ValueError):
        encoder.transform(dataframe_enc_na)

    # test case 7: transform before fit. The original instantiated
    # OrdinalCategoricalEncoder here, which did not test this encoder.
    encoder = MeanCategoricalEncoder()
    with pytest.raises(NotFittedError):
        encoder.transform(dataframe_enc)
Exemplo n.º 4
0
                            password=password)

# Read the full Homes table and discard the two identifying columns.
homes = pd.read_sql('SELECT * FROM Homes;', con=db_connection)
homes.drop(columns=['ParcelNumber', 'Address'], inplace=True)

# Persist this first dataframe; the application needs it later.
homes.to_csv('Data/houses.csv', index=False)

# Work on a copy so the saved homes dataframe stays untouched.
neighborhoods = homes.copy()

# The 'Neighborhood' column is about to be replaced by the mean target value
# of each neighborhood, so keep the original name in a separate column first.
neighborhoods['neighborhood_name'] = neighborhoods['Neighborhood']

# Fit the mean encoder against the sale price and apply it to the copy.
mean_enc = MeanCategoricalEncoder(variables=['Neighborhood'])
mean_enc.fit(neighborhoods, neighborhoods['SalePrice'])
neighborhoods = mean_enc.transform(neighborhoods)

# Load the neighborhood map (geocoordinates per neighborhood), align its name
# column with the one used above, and make the coordinates numeric.
hoods = gpd.read_file('Los Angeles Neighborhood Map.geojson').rename(
    columns={'name': 'neighborhood_name'})
hoods[['longitude', 'latitude']] = hoods[['longitude',
                                          'latitude']].astype(float)

#Create a dataframe that averages out every attribute by neighborhood
avg = neighborhoods.groupby('neighborhood_name')[[
Exemplo n.º 5
0
def find_category_mappings(df, variable, target):
    """Return a dict mapping each value of ``variable`` to the mean of ``target``.

    Args:
        df: DataFrame containing both columns.
        variable: Name of the column to group by.
        target: Name of the column whose per-group mean is computed.
    """
    target_by_category = df.groupby([variable])[target]
    category_means = target_by_category.mean()
    return category_means.to_dict()

def integer_encode(train, test, variable, ordinal_mapping):
    """Replace the values of ``variable`` in both frames using a mapping.

    The column is overwritten in place in ``train`` and ``test``; nothing is
    returned. Values without an entry in ``ordinal_mapping`` become NaN
    (behaviour of ``Series.map``).

    Args:
        train: Training DataFrame, modified in place.
        test: Test DataFrame, modified in place.
        variable: Name of the column to encode.
        ordinal_mapping: Mapping from original value to encoded value.
    """
    # Operate on the frames that were passed in. The original body ignored
    # its parameters and mutated the globals X_train / X_test instead.
    train[variable] = train[variable].map(ordinal_mapping)
    test[variable] = test[variable].map(ordinal_mapping)
	
# Encode each listed column of X_train / X_test with the per-category mean of
# 'survived', computed on X_train only.
for variable in ('sex', 'embarked'):
    mappings = find_category_mappings(
        df=X_train, variable=variable, target='survived')
    integer_encode(X_train, X_test, variable, mappings)

	
# With Feature-Engine
from feature_engine.categorical_encoders import MeanCategoricalEncoder
# Fit the encoder on the training data, then apply it to both splits.
mean_enc = MeanCategoricalEncoder(variables=['cabin', 'sex', 'embarked'])
mean_enc.fit(X_train, y_train)
X_train = mean_enc.transform(X_train)
X_test = mean_enc.transform(X_test)

# Inspect the learned mappings and the encoded variables (notebook-style
# expressions; they only display a value in an interactive session).
mean_enc.encoder_dict_
mean_enc.variables


					      -
---------Probability Ratio Encoding
# Only for binary classification problems
# Replacing categorical labels with this code and method will generate missing values
# for categories that are present in the test set but were not seen in the training set.
# It is therefore extremely important to handle rare labels beforehand.