def profile_encoder(encoder, hashing='fast', minmax_hash=False):
    """Benchmark an encoder on the employee_salaries dataset.

    Fits and transforms the ``employee_position_title`` column with
    ``n_components=50`` and returns the elapsed time.

    Parameters
    ----------
    encoder : class
        Encoder class to instantiate; must accept ``n_components``,
        ``hashing`` and ``minmax_hash`` keyword arguments.
    hashing : str, default='fast'
        Hashing scheme, forwarded to the encoder.
    minmax_hash : bool, default=False
        Whether to use min-max hashing, forwarded to the encoder.

    Returns
    -------
    float
        Elapsed seconds for constructing, fitting and transforming.
    """
    # Not a unit test: downloads a real dataset and measures runtime.
    from dirty_cat.datasets import fetch_employee_salaries
    employee_salaries = fetch_employee_salaries()
    df = employee_salaries.X
    X = df[["employee_position_title"]]
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, corrupting the measurement.
    t0 = time.perf_counter()
    enc = encoder(n_components=50, hashing=hashing, minmax_hash=minmax_hash)
    enc.fit(X)
    y = enc.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.perf_counter() - t0
    return eta
def profile_encoder(Encoder, init):
    """Benchmark an encoder on the employee_salaries dataset.

    Fits and transforms the ``employee_position_title`` column with
    ``n_components=50`` and returns the elapsed time.

    Parameters
    ----------
    Encoder : class
        Encoder class to instantiate; must accept ``n_components`` and
        ``init`` keyword arguments.
    init : str
        Initialization strategy, forwarded to the encoder.

    Returns
    -------
    float
        Elapsed seconds for constructing, fitting and transforming.
    """
    # Not a unit test: downloads a real dataset and measures runtime.
    from dirty_cat import datasets
    employee_salaries = datasets.fetch_employee_salaries()
    data = employee_salaries['data']
    X = data['employee_position_title'].tolist()
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, corrupting the measurement.
    t0 = time.perf_counter()
    encoder = Encoder(n_components=50, init=init)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.perf_counter() - t0
    return eta
def profile_encoder(Encoder, init):
    """Benchmark an encoder on the employee_salaries dataset.

    Loads the CSV referenced by the dataset fetcher, then fits and
    transforms the ``employee_position_title`` column with
    ``n_components=50`` and returns the elapsed time.

    Parameters
    ----------
    Encoder : class
        Encoder class to instantiate; must accept ``n_components`` and
        ``init`` keyword arguments.
    init : str
        Initialization strategy, forwarded to the encoder.

    Returns
    -------
    float
        Elapsed seconds for constructing, fitting and transforming.
    """
    # Not a unit test: downloads a real dataset and measures runtime.
    from dirty_cat.datasets import fetch_employee_salaries
    info = fetch_employee_salaries()
    data = pd.read_csv(info['path'], **info['read_csv_kwargs'])
    # [:, None] makes it a 2-D single-column array, as the encoder expects.
    X = np.array(data['employee_position_title'])[:, None]
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, corrupting the measurement.
    t0 = time.perf_counter()
    encoder = Encoder(n_components=50, init=init)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.perf_counter() - t0
    return eta
def profile_encoder(Encoder, hashing='fast', minmax_hash=False):
    """Benchmark an encoder on an inflated employee_salaries dataset.

    Fits and transforms the ``Employee Position Title`` column, repeated
    10 times to enlarge the workload, with ``n_components=50`` and
    returns the elapsed time.

    Parameters
    ----------
    Encoder : class
        Encoder class to instantiate; must accept ``n_components``,
        ``hashing`` and ``minmax_hash`` keyword arguments.
    hashing : str, default='fast'
        Hashing scheme, forwarded to the encoder.
    minmax_hash : bool, default=False
        Whether to use min-max hashing, forwarded to the encoder.

    Returns
    -------
    float
        Elapsed seconds for constructing, fitting and transforming.
    """
    # Not a unit test: downloads a real dataset and measures runtime.
    from dirty_cat import datasets
    import pandas as pd
    employee_salaries = datasets.fetch_employee_salaries()
    data = pd.read_csv(employee_salaries['path'])
    X = data['Employee Position Title'].tolist()
    # Repeat the column to get a larger, more stable timing workload.
    X = X * 10
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, corrupting the measurement.
    t0 = time.perf_counter()
    encoder = Encoder(n_components=50, hashing=hashing, minmax_hash=minmax_hash)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.perf_counter() - t0
    return eta
from sklearn.model_selection import KFold
from dirty_cat import datasets
from dirty_cat import SimilarityEncoder, TargetEncoder

# Encoding methods to compare; keys are the method names iterated below.
encoder_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
    'similarity': SimilarityEncoder(similarity='ngram', handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'num': FunctionTransformer(None)
}

data_file = datasets.fetch_employee_salaries()

for method in ['one-hot', 'target', 'similarity']:
    # Load the data (all columns read as strings, parsed manually below)
    df = pd.read_csv(data_file).astype(str)
    # Drop the leading character (presumably a '$' currency symbol —
    # TODO confirm against the raw CSV) before casting salaries to float.
    df['Current Annual Salary'] = [float(s[1:]) for s in df['Current Annual Salary']]
    # Keep only the year, i.e. the last '/'-separated field of the date.
    df['Year First Hired'] = [int(s.split('/')[-1]) for s in df['Date First Hired']]

    target_column = 'Current Annual Salary'
    y = df[target_column].values.ravel()

    # Transform the data into a numerical matrix
    # NOTE: this mapping continues past this chunk of the file.
    encoder_type = {
        'one-hot': ['Gender', 'Department Name', 'Assignment Category'],
:class:`~dirty_cat.GapEncoder`

.. |SE| replace::
    :class:`~dirty_cat.SimilarityEncoder`

.. |permutation importances| replace::
    :func:`~sklearn.inspection.permutation_importance`
"""

# %%
#
# The data
# ========
#
# We first retrieve the dataset:

from dirty_cat.datasets import fetch_employee_salaries

employee_salaries = fetch_employee_salaries()

# %%
# X, the input data (descriptions of employees):

X = employee_salaries.X
X

# %%
# and y, our target column (the annual salary)

y = employee_salaries.y
y.name

# %%
# Now, let's carry out some basic preprocessing:

import pandas as pd

# Parse the hire date strings into datetime objects.
X['date_first_hired'] = pd.to_datetime(X['date_first_hired'])
Investigating dirty categories
=================================

What are dirty categorical variables and how can a good encoding help with
statistical learning.
"""

#########################################################################
# What do we mean by dirty categories?
# -------------------------------------------------
#
# Let's look at a dataset called employee salaries:
import pandas as pd
from dirty_cat import datasets

employee_salaries = datasets.fetch_employee_salaries()
print(employee_salaries['description'])
data = pd.read_csv(employee_salaries['path'])
print(data.head(n=5))

#########################################################################
# Here is how many unique entries there are per column
print(data.nunique())

#########################################################################
# As we can see, some entries have many different unique values:
print(data['Employee Position Title'].value_counts().sort_index())

#########################################################################
# These different entries are often variations on the same entities:
# there are 3 kinds of Accountant/Auditor.