def test_MeanCategoricalEncoder():
    """Check MeanCategoricalEncoder on one categorical column with three labels.

    The input holds 10 rows of 'A', 6 of 'B' and 4 of 'C', each category
    having exactly two positive targets, so the per-category target means
    are 0.2, 1/3 and 0.5 respectively.
    """
    # input data
    labels = ['A'] * 10 + ['B'] * 6 + ['C'] * 4
    outcome = [1, 1] + [0] * 8 + [1, 1] + [0] * 4 + [1, 1] + [0] * 2
    df = pd.DataFrame({'category': labels, 'target': outcome})

    # expected output: every label replaced by the target mean of its category
    expected_means = [0.200000] * 10 + [0.333333] * 6 + [0.500000] * 4
    transf_df = pd.DataFrame({
        'category': expected_means,
        'target': list(outcome),
    })

    # fit on the categorical column only, then encode that same column
    features = df['category'].to_frame()
    encoder = MeanCategoricalEncoder(variables=['category'])
    encoder.fit(features, df['target'])
    X = encoder.transform(features)

    # transformed values
    pd.testing.assert_frame_equal(X, transf_df['category'].to_frame())

    # init parameter is kept as passed
    assert encoder.variables == ['category']

    # learned attributes
    expected_mapping = {
        'category': {
            'A': 0.20000000000000001,
            'B': 0.33333333333333331,
            'C': 0.5,
        }
    }
    assert encoder.encoder_dict_ == expected_mapping
    assert encoder.input_shape_ == (20, 1)
def train_models(ModelClass, invoices, observation_end_dates, rfe=False, **kwargs):
    """Fit one model per observation end date on a rolling training window.

    For each date, the train/test split comes from ``get_train_test_data``.
    The training set is the concatenation of the most recent ``lag`` train
    splits seen so far; ``lag`` is not a parameter and is read from the
    enclosing scope. The 'MostBoughtItem' column is rare-label encoded and
    then mean encoded, with both encoders fitted on the training data only.

    Args:
        ModelClass: Estimator class, instantiated as ``ModelClass(**kwargs)``.
        invoices: Data forwarded unchanged to ``get_train_test_data``.
        observation_end_dates: Iterable of dates; one model is trained per date.
        rfe: When true, run RFE keeping 8 features and train/store only the
            selected columns.
        **kwargs: Keyword arguments passed to ``ModelClass``.

    Returns:
        dict with keys 'models', 'observation_end_dates', 'X_train',
        'y_train', 'X_test' and 'y_test'; every list has one entry per date.
    """
    results = {
        'models': [],
        'observation_end_dates': observation_end_dates,
        'X_train': [],
        'y_train': [],
        'X_test': [],
        'y_test': [],
    }
    monthly_X = []
    monthly_y = []

    for end_date in observation_end_dates:
        X_month, y_month, X_test, y_test = get_train_test_data(invoices, end_date)
        monthly_X.append(X_month)
        monthly_y.append(y_month)

        # Last "lag" months used as training data
        X_train = pd.concat(monthly_X[-lag:]).reset_index(drop=True)
        y_train = pd.concat(monthly_y[-lag:]).reset_index(drop=True).astype(int)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True).astype(int)

        # Encode "MostBoughtItem" feature: group rare labels first (with a
        # looser tolerance on small training sets), then mean-encode.
        rare_tol = 0.02 if len(X_train) < 100 else 0.01
        rare_encoder = RareLabelCategoricalEncoder(
            tol=rare_tol, variables=['MostBoughtItem'])
        rare_encoder = rare_encoder.fit(X_train)
        X_train = rare_encoder.transform(X_train)
        X_test = rare_encoder.transform(X_test)

        mean_enc = MeanCategoricalEncoder(variables=['MostBoughtItem'])
        mean_enc = mean_enc.fit(X_train, y_train)
        X_train = mean_enc.transform(X_train)
        X_test = mean_enc.transform(X_test)

        if not rfe:
            model = ModelClass(**kwargs)
            model.fit(X_train, y_train)
            results['X_train'].append(X_train)
            results['X_test'].append(X_test)
        else:
            # Select features with a throwaway estimator, then train a fresh
            # model on the selected columns only.
            selector = RFE(ModelClass(**kwargs), n_features_to_select=8)
            selector.fit(X_train, y_train)
            kept_columns = X_train.columns[(selector.get_support())]
            model = ModelClass(**kwargs).fit(X_train[kept_columns], y_train)
            results['X_train'].append(X_train[kept_columns])
            results['X_test'].append(X_test[kept_columns])

        results['models'].append(model)
        results['y_train'].append(y_train)
        results['y_test'].append(y_test)

    return results
def test_MeanCategoricalEncoder(dataframe_enc, dataframe_enc_rare, dataframe_enc_na):
    """Exercise MeanCategoricalEncoder: encoding, variable selection and errors.

    The three arguments are pytest fixtures defined outside this block. From
    their use here: ``dataframe_enc`` has columns 'var_A', 'var_B' and
    'target' with 20 rows; ``dataframe_enc_rare`` is expected to contain
    categories unseen during fit; ``dataframe_enc_na`` is expected to contain
    missing values in 'var_A' and/or 'var_B'.
    """
    one_third = 0.3333333333333333
    expected_var_A = [one_third] * 6 + [0.2] * 10 + [0.5] * 4
    expected_var_B = [0.2] * 10 + [one_third] * 6 + [0.5] * 4

    # test case 1: 1 variable
    encoder = MeanCategoricalEncoder(variables=['var_A'])
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    X = encoder.transform(dataframe_enc[['var_A', 'var_B']])

    # transformed dataframe
    transf_df = dataframe_enc.copy()
    transf_df['var_A'] = expected_var_A

    # init params
    assert encoder.variables == ['var_A']
    # fit params
    assert encoder.encoder_dict_ == {
        'var_A': {'A': one_third, 'B': 0.2, 'C': 0.5},
    }
    assert encoder.input_shape_ == (20, 2)
    # transform params
    pd.testing.assert_frame_equal(X, transf_df[['var_A', 'var_B']])

    # test case 2: automatically select variables
    encoder = MeanCategoricalEncoder(variables=None)
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    X = encoder.transform(dataframe_enc[['var_A', 'var_B']])

    # transformed dataframe
    transf_df['var_A'] = expected_var_A
    transf_df['var_B'] = expected_var_B

    # init params
    assert encoder.variables == ['var_A', 'var_B']
    # fit params
    assert encoder.encoder_dict_ == {
        'var_A': {'A': one_third, 'B': 0.2, 'C': 0.5},
        'var_B': {'A': 0.2, 'B': one_third, 'C': 0.5},
    }
    assert encoder.input_shape_ == (20, 2)
    # transform params
    pd.testing.assert_frame_equal(X, transf_df[['var_A', 'var_B']])

    # In the error cases below only the call expected to fail sits inside the
    # context manager, so set-up code cannot satisfy the expectation by accident.

    # test case 3: raises error if target is not passed
    encoder = MeanCategoricalEncoder()
    with pytest.raises(TypeError):
        encoder.fit(dataframe_enc)

    # test case 4: when dataset to be transformed contains categories not
    # present in training dataset
    encoder = MeanCategoricalEncoder()
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    with pytest.warns(UserWarning):
        encoder.transform(dataframe_enc_rare[['var_A', 'var_B']])

    # test case 5: when dataset contains na, fit method
    encoder = MeanCategoricalEncoder()
    with pytest.raises(ValueError):
        encoder.fit(dataframe_enc_na[['var_A', 'var_B']], dataframe_enc_na['target'])

    # test case 6: when dataset contains na, transform method
    # Pass the same two columns used in fit so the ValueError can only come
    # from the missing values, not from a column-count mismatch.
    encoder = MeanCategoricalEncoder()
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    with pytest.raises(ValueError):
        encoder.transform(dataframe_enc_na[['var_A', 'var_B']])

    # test case 7: transform before fit must fail
    # (previously this instantiated OrdinalCategoricalEncoder, so the encoder
    # under test was never checked)
    encoder = MeanCategoricalEncoder()
    with pytest.raises(NotFittedError):
        encoder.transform(dataframe_enc)
password=password) homes = pd.read_sql('SELECT * FROM Homes;', con=db_connection) homes.drop(['ParcelNumber', 'Address'], axis=1, inplace=True) #Save the first dataframe that we will need for the application homes.to_csv('Data/houses.csv', index=False) #Create a copy of the homes dataframe neighborhoods = homes.copy() #We are going to create another column that stores the mean target value for each neighborhood #First we need to save the neighborhood name into another column neighborhoods['neighborhood_name'] = neighborhoods['Neighborhood'] mean_enc = MeanCategoricalEncoder(variables=['Neighborhood']) #Fit the encoder mean_enc.fit(neighborhoods, neighborhoods['SalePrice']) #Transform the neighborhoods dataframe neighborhoods = mean_enc.transform(neighborhoods) #Load a dataframe that has the geocoordinates of each neighborhood hoods = gpd.read_file('Los Angeles Neighborhood Map.geojson') hoods = hoods.rename(columns={'name': 'neighborhood_name'}) hoods[['longitude', 'latitude']] = hoods[['longitude', 'latitude']].astype(float) #Create a dataframe that averages out every attribute by neighborhood avg = neighborhoods.groupby('neighborhood_name')[[
def find_category_mappings(df, variable, target):
    """Return a dict mapping each category of ``variable`` to the mean of ``target``.

    Args:
        df: Table supporting ``groupby`` (presumably a pandas DataFrame).
        variable: Name of the categorical column to group by.
        target: Name of the column whose mean is computed per category.
    """
    return df.groupby([variable])[target].mean().to_dict()


def integer_encode(train, test, variable, ordinal_mapping):
    """Replace the values of ``variable`` in ``train`` and ``test`` in place.

    Both arguments are modified through item assignment; nothing is returned.
    The same mapping is applied to both sets so that the test set is encoded
    with values learned elsewhere (e.g. on the training set).

    Args:
        train: Training table whose ``variable`` column is overwritten.
        test: Test table whose ``variable`` column is overwritten.
        variable: Name of the column to encode.
        ordinal_mapping: Mapping handed to the column's ``map`` method.
    """
    # Operate on the arguments; the earlier version ignored them and always
    # wrote to the module-level X_train / X_test.
    train[variable] = train[variable].map(ordinal_mapping)
    test[variable] = test[variable].map(ordinal_mapping)


for variable in ['sex', 'embarked']:
    mappings = find_category_mappings(X_train, variable, 'survived')
    integer_encode(X_train, X_test, variable, mappings)

# --
# With Feature-Engine
from feature_engine.categorical_encoders import MeanCategoricalEncoder

mean_enc = MeanCategoricalEncoder(
    variables=['cabin', 'sex', 'embarked'])
mean_enc.fit(X_train, y_train)
X_train = mean_enc.transform(X_train)
X_test = mean_enc.transform(X_test)
mean_enc.encoder_dict_
mean_enc.variables

# -
# --------- Probability Ratio Encoding
# Only for binary classification problems.
# Replacing categorical labels with this code and method will generate missing values
# for categories present in the test set that were not seen in the training set.
# Therefore it is extremely important to handle rare labels beforehand.