an open-ended question, on which one-hot encoding does not work well.
The other columns are more traditional categorical or numerical
variables.

Let's see how different encodings for the dirty column impact the
score of a classification problem.

"""

################################################################################
# Loading the data
# ----------------
from dirty_cat.datasets import fetch_midwest_survey
import pandas as pd

# Fetch the survey dataset description; its 'path' entry locates the CSV file.
dataset = fetch_midwest_survey()
# Read the CSV first, then cast every column to str in a separate step.
df = pd.read_csv(dataset['path'])
df = df.astype(str)

################################################################################
# The challenge with this data is that it contains a free-form input
# column, where people put whatever they want:
# Name of the free-form survey column (literal split only for line length).
dirty_column = ('In your own words, what would you call the part of the '
                'country you live in now?')
# Show the last ten entries of the value counts for that column.
print(df[dirty_column].value_counts().tail(10))

################################################################################
# Separating clean and dirty columns, as well as a column we will try to predict
# ------------------------------------------------------------------------------

# Name of the column we will try to predict.
target_column = 'Location (Census Region)'
clean_columns = [
    'Personally identification as a Midwesterner?',
示例#2
0
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

from dirty_cat import datasets
from dirty_cat import SimilarityEncoder

# encoding methods
# Encoding methods, stored under the short name of each method.
encoder_dict = {}
# One-hot encoding; handle_unknown='ignore' is set for unseen categories.
encoder_dict['one-hot'] = OneHotEncoder(handle_unknown='ignore')
# Similarity encoding configured with the 'ngram' similarity.
encoder_dict['similarity'] = SimilarityEncoder(
    similarity='ngram', handle_unknown='ignore')
# FunctionTransformer built with func=None, used under the 'num' key.
encoder_dict['num'] = FunctionTransformer(None)

# Fetch the Midwest survey data; the returned value is handed to
# pd.read_csv inside the loop below.
data_file = datasets.fetch_midwest_survey()

for method in ['one-hot', 'similarity']:
    # Load the data
    df = pd.read_csv(data_file).astype(str)

    target_column = 'Location (Census Region)'
    y = df[target_column].values.ravel()

    # Transform the data into a numerical matrix
    encoder_type = {
        'one-hot': [
            'Personally identification as a Midwesterner?', 'Illinois in MW?',
            'Indiana in MW?', 'Kansas in MW?', 'Iowa in MW?',
            'Michigan in MW?', 'Minnesota in MW?', 'Missouri in MW?',
            'Nebraska in MW?', 'North Dakota in MW?', 'Ohio in MW?',