def ordinal_encoding_dict_lookup(X_in, mapping=None, cols=None, handle_unknown='value', handle_missing='value'):
    """
    Ordinal-encode DataFrame columns using plain ``dict`` lookups.

    The function has two modes:

    * ``mapping`` given (transform): every column named in ``mapping`` is
      replaced by the integer codes found in its dict. Values that are not
      keys of the dict are encoded as ``-1``.
    * ``mapping`` omitted (fit): a mapping is built for every column in
      ``cols``; the data itself is returned as an untouched copy.

    NaN in the data is not supported by the transform step: the lookup is a
    plain ``dict.get`` and a NaN taken from the column is generally not equal
    to the NaN stored as a dict key.

    Parameters
    ----------
    X_in : pandas.DataFrame
        Input data. It is deep-copied and never modified.
    mapping : list of dict, optional
        One entry per column with keys ``'col'`` (column name) and
        ``'mapping'`` (dict from category to integer code).
    cols : iterable, optional
        Columns to build a mapping for. Defaults to all columns of ``X_in``.
        Only used when ``mapping`` is None.
    handle_unknown : str
        Accepted for signature compatibility; this implementation does not
        consult it and always encodes unknown values as ``-1``.
    handle_missing : str
        ``'value'`` adds a NaN -> ``-2`` entry only when the column contains
        no NaN; ``'return_nan'`` always sets the NaN entry to ``-2``. Any
        other value leaves the mapping as built.

    Returns
    -------
    tuple
        ``(X, mapping_out)``. ``mapping_out`` is the ``mapping`` argument
        itself in transform mode, otherwise a new list of dicts with keys
        ``'col'``, ``'mapping'`` and ``'data_type'``.
    """
    X = X_in.copy(deep=True)

    if mapping is not None:
        for switch in mapping:
            column = switch.get('col')
            col_mapping = switch['mapping']
            # Plain dict lookup per value; anything unseen becomes -1.
            X[column] = [col_mapping.get(x, -1) for x in X[column]]
        return X, mapping

    if cols is None:
        cols = X.columns.values

    nan_identity = np.nan
    mapping_out = []
    for col in cols:
        if util.is_category(X[col].dtype):
            categories = X[col].cat.categories.tolist()
            # The categories of a categorical never include NaN, so add it
            # explicitly when the column actually contains missing values.
            if X[col].isna().any():
                categories += [np.nan]
        else:
            categories = X[col].unique()

        index = pd.Series(categories).fillna(nan_identity).unique()
        data = pd.Series(index=index, data=range(1, len(index) + 1))

        # Logical ``not`` rather than bitwise ``~``: the latter only behaves
        # as a negation for numpy booleans and is always truthy for ``bool``.
        has_nan = data.index.isnull().any()
        if handle_missing == 'return_nan' or (handle_missing == 'value' and not has_nan):
            data.loc[nan_identity] = -2

        mapping_out.append({'col': col, 'mapping': data.to_dict(), 'data_type': X[col].dtype})

    return X, mapping_out
def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'):
    """
    Represent each category of a column by a single integer code.

    Without ``mapping`` the function only builds one: for every column in
    ``cols`` the distinct values (or the categories of a categorical column)
    are numbered 1, 2, 3, ... in the order they are encountered, and the data
    is returned as an unmodified copy. With ``mapping`` the listed columns
    are replaced by their codes.

    Parameters
    ----------
    X_in : pandas.DataFrame
        Input data; a deep copy is made, the original is not modified.
    mapping : list of dict, optional
        One entry per column with keys ``'col'`` and ``'mapping'``, the
        latter being anything ``dict()`` accepts (e.g. a list of
        ``(category, code)`` pairs).
    cols : iterable, optional
        Columns to build a mapping for; defaults to every column.
    impute_missing : bool
        When False, categories absent from the mapping are left as NaN and
        ``handle_unknown`` is not consulted.
    handle_unknown : str
        ``'impute'`` replaces categories absent from the mapping with 0,
        ``'error'`` raises instead. Other values leave NaN in place.

    Returns
    -------
    tuple
        ``(X, mapping_out)`` where ``mapping_out`` is the given ``mapping``
        or the newly built list of ``{'col', 'mapping', 'data_type'}`` dicts.

    Raises
    ------
    ValueError
        If ``impute_missing`` is true, ``handle_unknown == 'error'`` and a
        column contains a category missing from its mapping.
    """
    X = X_in.copy(deep=True)

    if cols is None:
        cols = X.columns.values

    if mapping is None:
        fitted = []
        for col in cols:
            if util.is_category(X[col].dtype):
                categories = X[col].cat.categories
            else:
                categories = [value for value in pd.unique(X[col].values) if value is not None]
            # Codes start at 1; 0 is what 'impute' uses for unseen categories.
            codes = dict(zip(categories, range(1, len(categories) + 1)))
            fitted.append({'col': col, 'mapping': list(codes.items()), 'data_type': X[col].dtype})
        return X, fitted

    for entry in mapping:
        lookup = dict(entry.get('mapping'))
        column = entry.get('col')
        encoded = X[column].map(lambda value: lookup.get(value, np.nan))

        # Integer codes when every value was found; NaN forces a float column.
        try:
            encoded = encoded.astype(int)
        except ValueError:
            encoded = encoded.astype(float)

        if impute_missing:
            if handle_unknown == 'impute':
                encoded = encoded.fillna(0)
            elif handle_unknown == 'error' and encoded.isnull().any():
                raise ValueError('Unexpected categories found in column %s' % column)

        X[column] = encoded

    return X, mapping
def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', handle_missing='value'):
    """
    Represent each category of a column by a single integer code.

    Without ``mapping`` the function only builds one: for every column in
    ``cols`` the distinct values (or the categories of a categorical column)
    are numbered 1, 2, 3, ... in the order they are encountered, and the data
    is returned as an unmodified copy. With ``mapping`` the listed columns
    are replaced by their codes.

    Reserved codes: ``-1`` marks a category that is absent from the mapping
    (``handle_unknown='value'``) and ``-2`` is the code stored for NaN.

    Parameters
    ----------
    X_in : pandas.DataFrame
        Input data; a deep copy is made, the original is not modified.
    mapping : list of dict, optional
        One entry per column with keys ``'col'`` and ``'mapping'``, the
        latter being a mapper accepted by ``Series.map`` (this function
        itself produces a ``pandas.Series`` indexed by category).
    cols : iterable, optional
        Columns to build a mapping for; defaults to every column.
    handle_unknown : str
        ``'value'`` encodes categories absent from the mapping as ``-1``,
        ``'error'`` raises instead. Other values leave NaN in place.
    handle_missing : str
        When building a mapping: ``'value'`` adds a NaN -> ``-2`` entry only
        if the column has no NaN, ``'return_nan'`` always sets the NaN entry
        to ``-2``. When applying a mapping: ``'return_nan'`` turns every
        ``-2`` code back into NaN.

    Returns
    -------
    tuple
        ``(X, mapping_out)`` where ``mapping_out`` is the given ``mapping``
        or the newly built list of ``{'col', 'mapping', 'data_type'}`` dicts.

    Raises
    ------
    ValueError
        If ``handle_unknown == 'error'`` and a column contains a category
        missing from its mapping.
    """
    X = X_in.copy(deep=True)

    if cols is None:
        cols = X.columns.values

    if mapping is not None:
        # Lookup table used to translate the reserved -2 code back to NaN.
        return_nan_series = pd.Series(data=[np.nan], index=[-2])

        for switch in mapping:
            column = switch.get('col')
            encoded = X[column].map(switch['mapping'])

            # Integer codes when every value was found; NaN forces a float column.
            try:
                encoded = encoded.astype(int)
            except ValueError:
                encoded = encoded.astype(float)

            if handle_unknown == 'value':
                # Work on a local Series and assign back once. Calling
                # ``X[column].fillna(..., inplace=True)`` would be chained
                # assignment, which modifies a temporary under Copy-on-Write.
                encoded = encoded.fillna(-1)
            elif handle_unknown == 'error':
                if encoded.isnull().any():
                    raise ValueError(
                        'Unexpected categories found in column %s' % column)

            if handle_missing == 'return_nan':
                # Mapping through ``return_nan_series`` yields NaN everywhere;
                # keep that NaN only where the code is -2, else the code itself.
                encoded = encoded.map(return_nan_series).where(encoded == -2, encoded)

            X[column] = encoded

        return X, mapping

    nan_identity = np.nan
    mapping_out = []
    for col in cols:
        if util.is_category(X[col].dtype):
            categories = X[col].cat.categories
        else:
            categories = X[col].unique()

        index = pd.Series(categories).fillna(nan_identity).unique()
        data = pd.Series(index=index, data=range(1, len(index) + 1))

        # Logical ``not`` rather than bitwise ``~``: the latter only behaves
        # as a negation for numpy booleans and is always truthy for ``bool``.
        has_nan = data.index.isnull().any()
        if handle_missing == 'return_nan' or (handle_missing == 'value' and not has_nan):
            data.loc[nan_identity] = -2

        mapping_out.append({'col': col, 'mapping': data, 'data_type': X[col].dtype})

    return X, mapping_out
def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'):
    """
    Represent each category of a column by a single integer code.

    Without ``mapping`` the function only builds one: for every column in
    ``cols`` the distinct values (or the categories of a categorical column)
    are numbered 1, 2, 3, ... in the order they are encountered, and the data
    is returned as an unmodified copy. With ``mapping`` the listed columns
    are replaced by their codes.

    Parameters
    ----------
    X_in : pandas.DataFrame
        Input data; a deep copy is made, the original is not modified.
    mapping : list of dict, optional
        One entry per column with keys ``'col'`` and ``'mapping'``, the
        latter being a mapper accepted by ``Series.map`` (this function
        itself produces a ``pandas.Series`` indexed by category).
    cols : iterable, optional
        Columns to build a mapping for; defaults to every column.
    impute_missing : bool
        When False, categories absent from the mapping are left as NaN and
        ``handle_unknown`` is not consulted.
    handle_unknown : str
        ``'impute'`` replaces categories absent from the mapping with 0,
        ``'error'`` raises instead. Other values leave NaN in place.

    Returns
    -------
    tuple
        ``(X, mapping_out)`` where ``mapping_out`` is the given ``mapping``
        or the newly built list of ``{'col', 'mapping', 'data_type'}`` dicts.

    Raises
    ------
    ValueError
        If ``impute_missing`` is true, ``handle_unknown == 'error'`` and a
        column contains a category missing from its mapping.
    """
    X = X_in.copy(deep=True)

    if cols is None:
        cols = X.columns.values

    if mapping is not None:
        for switch in mapping:
            column = switch.get('col')
            encoded = X[column].map(switch['mapping'])

            # Integer codes when every value was found; NaN forces a float column.
            try:
                encoded = encoded.astype(int)
            except ValueError:
                encoded = encoded.astype(float)

            if impute_missing:
                if handle_unknown == 'impute':
                    # Work on a local Series and assign back once. Calling
                    # ``X[column].fillna(..., inplace=True)`` would be chained
                    # assignment, which modifies a temporary under Copy-on-Write.
                    encoded = encoded.fillna(0)
                elif handle_unknown == 'error':
                    if encoded.isnull().any():
                        raise ValueError(
                            'Unexpected categories found in column %s' % column)

            X[column] = encoded

        return X, mapping

    mapping_out = []
    for col in cols:
        if util.is_category(X[col].dtype):
            categories = X[col].cat.categories
        else:
            categories = [x for x in pd.unique(X[col].values) if x is not None]

        # Codes start at 1; 0 is what 'impute' uses for unseen categories.
        labels = list(categories)
        col_mapping = pd.Series(data=list(range(1, len(labels) + 1)), index=labels)

        mapping_out.append({'col': col, 'mapping': col_mapping, 'data_type': X[col].dtype})

    return X, mapping_out