an open-ended question, on which one-hot encoding does not work well. The other columns are more traditional categorical or numerical variables. Let's see how different encodings of the dirty column impact the score of a classification problem. """ ################################################################################ # Loading the data # ---------------- from dirty_cat.datasets import fetch_midwest_survey import pandas as pd dataset = fetch_midwest_survey() df = pd.read_csv(dataset['path']).astype(str) ################################################################################ # The challenge with this data is that it contains a free-form input # column, where people put whatever they want: dirty_column = 'In your own words, what would you call the part of the country you live in now?' print(df[dirty_column].value_counts()[-10:]) ################################################################################ # Separating clean and dirty columns as well as a column we will try to predict # ------------------------------------------------------------------------------ target_column = 'Location (Census Region)' clean_columns = [ 'Personally identification as a Midwesterner?',
from sklearn.preprocessing import OneHotEncoder from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import StratifiedKFold from dirty_cat import datasets from dirty_cat import SimilarityEncoder # encoding methods encoder_dict = { 'one-hot': OneHotEncoder(handle_unknown='ignore'), 'similarity': SimilarityEncoder(similarity='ngram', handle_unknown='ignore'), 'num': FunctionTransformer(None) } data_file = datasets.fetch_midwest_survey() for method in ['one-hot', 'similarity']: # Load the data df = pd.read_csv(data_file).astype(str) target_column = 'Location (Census Region)' y = df[target_column].values.ravel() # Transform the data into a numerical matrix encoder_type = { 'one-hot': [ 'Personally identification as a Midwesterner?', 'Illinois in MW?', 'Indiana in MW?', 'Kansas in MW?', 'Iowa in MW?', 'Michigan in MW?', 'Minnesota in MW?', 'Missouri in MW?', 'Nebraska in MW?', 'North Dakota in MW?', 'Ohio in MW?',