def fit(self, X_df, y=None):
    def regroup_cat(X, liste):
        if X not in liste:
            return 'other'
        else:
            return X

    self.prop_to_keep = [
        'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
    ]
    self.prop_transformer = TargetEncoder()
    self.prop_transformer.fit(
        X_df['property_type'].apply(
            lambda x: regroup_cat(x, self.prop_to_keep)), y)

    self.pol_to_keep = [
        'flexible', 'strict_14_with_grace_period', 'moderate', 'moderate_new'
    ]
    self.pol_transformer = TargetEncoder()
    self.pol_transformer.fit(
        X_df['cancellation_policy'].apply(
            lambda x: regroup_cat(x, self.pol_to_keep)), y)

    self.room_transformer = OrdinalEncoder()
    self.room_transformer.fit(X_df['room_type'])

    self.city_transformer = OneHotEncoder(handle_unknown='ignore')
    self.city_transformer.fit(pd.DataFrame(X_df['city_origin']))
    # numeric_transformer = Pipeline(steps=[('impute', SimpleImputer(strategy='median'))])
    return self
def fit(self, X, y=None, **kwargs):
    """
    :param X:
    :param y:
    :param kwargs:
    :return:
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
class GoogleAiLabelEncoder(BaseTransformer):
    def __init__(self, colname):
        self.colname = colname
        self.encoder = OrdinalEncoder()

    def fit(self, annotations, **kwargs):
        self.encoder.fit(annotations[self.colname].values)
        return self

    def transform(self, annotations, annotations_human_labels, **kwargs):
        if annotations is not None:
            annotations[self.colname] = self.encoder.transform(
                annotations[self.colname].values)
            annotations_human_labels[self.colname] = self.encoder.transform(
                annotations_human_labels[self.colname].values)
            return {
                'annotations': annotations,
                'annotations_human_labels': annotations_human_labels
            }
        else:
            return {'mapping': self.encoder.category_mapping[0]['mapping']}

    def load(self, filepath):
        self.encoder = joblib.load(filepath)
        return self

    def persist(self, filepath):
        joblib.dump(self.encoder, filepath)
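# A minimal usage sketch for GoogleAiLabelEncoder above. The two frames and
# the 'label_code' column are invented for illustration; the real pipeline
# supplies its own annotation DataFrames.
import pandas as pd

annotations = pd.DataFrame({'label_code': ['/m/01', '/m/02', '/m/01']})
human_labels = pd.DataFrame({'label_code': ['/m/02', '/m/01']})

label_encoder = GoogleAiLabelEncoder(colname='label_code')
label_encoder.fit(annotations)
encoded = label_encoder.transform(annotations, human_labels)
label_encoder.persist('label_encoder.joblib')  # round-trips via joblib
restored = GoogleAiLabelEncoder('label_code').load('label_encoder.joblib')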
class HelmertEncoder(BaseEstimator, TransformerMixin):
    """Helmert contrast coding for categorical features."""

    def __init__(self, verbose=0, cols=None, drop_invariant=False):
        """
        :param verbose:
        :param cols:
        :return:
        """
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)

    def fit(self, X, y=None, **kwargs):
        """
        :param X:
        :param y:
        :param kwargs:
        :return:
        """
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """
        :param X:
        :return:
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if self.cols == []:
            return X

        X = self.ordinal_encoder.transform(X)
        X = helmert_coding(X, cols=self.cols)

        if self.drop_invariant:
            for col in self.drop_cols:
                X.drop(col, axis=1, inplace=True)

        return X
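# A minimal usage sketch for the HelmertEncoder wrapper above, assuming the
# module-level helmert_coding() helper it delegates to is importable alongside it.
import pandas as pd

df = pd.DataFrame({'color': ['red', 'green', 'blue', 'green']})
enc = HelmertEncoder(cols=['color'])
df_helmert = enc.fit(df).transform(df)  # one contrast column per level minus one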
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    ordinal_mapping = self.ordinal_encoder.category_mapping

    mappings_out = []
    for switch in ordinal_mapping:
        values = [x[1] for x in switch.get('mapping')]
        column_mapping = self.fit_helmert_coding(values)
        mappings_out.append({
            'col': switch.get('col'),
            'mapping': column_mapping,
        })

    self.mapping = mappings_out

    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        generated_cols = get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().bool():
            raise ValueError('Columns to be encoded cannot contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    self.mapping = self.generate_mapping()

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = list(X_temp.columns)

    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            [self.feature_names.remove(x) for x in self.drop_cols]
        except ValueError as e:  # list.remove raises ValueError when the item is absent
            if self.verbose > 0:
                print("Could not remove column from feature names. "
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y=None, **kwargs):
    """Fits an ordinal encoder to produce a consistent mapping across
    applications and optionally finds generally invariant columns to drop
    consistently.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    # first check the type
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    ordinal_mapping = self.ordinal_encoder.category_mapping

    mappings_out = []
    for switch in ordinal_mapping:
        values = [x[1] for x in switch.get('mapping')]
        column_mapping = self.fit_backward_difference_coding(values)
        mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, })

    self.mapping = mappings_out

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        generated_cols = get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    # first check the type
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    for col in self.cols:
        self.digits_per_col[col] = self.calc_required_digits(X, col)

    # do a transform on the training data to get a column list
    X_t = self.transform(X, override_return_df=True)
    self._encoded_columns = X_t.columns.values

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols=None, drop_original: bool = False,
                 encoder=OrdinalEncoder()):
        """
        Categorical transformer. This is a wrapper for categorical encoders.

        :param cat_cols:
        :param drop_original:
        :param encoder:
        """
        self.cat_cols = cat_cols
        self.drop_original = drop_original
        self.encoder = encoder
        self.default_encoder = OrdinalEncoder()

    def fit(self, X, y=None):
        if self.cat_cols is None:
            kinds = np.array([dt.kind for dt in X.dtypes])
            is_cat = kinds == 'O'
            self.cat_cols = list(X.columns[is_cat])
        self.encoder.set_params(cols=self.cat_cols)
        self.default_encoder.set_params(cols=self.cat_cols)

        self.encoder.fit(X[self.cat_cols], y)
        self.default_encoder.fit(X[self.cat_cols], y)

        return self

    def transform(self, X, y=None):
        data = copy.deepcopy(X)
        new_cat_names = [f'{col}_encoded' for col in self.cat_cols]
        encoded_data = self.encoder.transform(data[self.cat_cols])
        if encoded_data.shape[1] == len(self.cat_cols):
            data[new_cat_names] = encoded_data
        else:
            pass

        if self.drop_original:
            data = data.drop(self.cat_cols, axis=1)
        else:
            data[self.cat_cols] = self.default_encoder.transform(
                data[self.cat_cols])

        return data

    def fit_transform(self, X, y=None, **fit_params):
        data = copy.deepcopy(X)
        # pass y through so supervised encoders (e.g. TargetEncoder) can use it
        self.fit(data, y)
        return self.transform(data)
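# Usage sketch for CategoricalTransformer: wrap a supervised encoder and keep
# ordinal-encoded copies of the original columns. TargetEncoder is just one
# example of an encoder that needs y; the toy data below is invented.
import pandas as pd

X = pd.DataFrame({'city': ['a', 'b', 'a', 'c'], 'num': [1.0, 2.0, 3.0, 4.0]})
y = pd.Series([0, 1, 0, 1])

ct = CategoricalTransformer(encoder=TargetEncoder())
X_enc = ct.fit_transform(X, y)  # adds 'city_encoded' next to the ordinal-coded 'city'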
def ordinal_encode(self, data):
    """
    Ordinal-encode multiple categorical variables and store the fitted
    conversion rule for each of them on self.model. Returns the encoded
    dataset.

    :param data: the dataset used for training (the `data` attribute of a Dataset object)
    """
    org_order = data.columns
    # self.enc_dict = {}
    self.model = OrdinalEncoder(cols=self.columns, handle_unknown="impute")
    oe_data = self.model.fit_transform(data)
    oe_data = oe_data.loc[:, org_order]  # .ix is deprecated; use .loc to restore column order
    return oe_data
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    self.mapping = self.generate_mapping()

    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # unite the input into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index)

    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) +
                         " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().bool():
            raise ValueError('Columns to be encoded cannot contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)
    self.mapping = self.fit_target_encoding(X_ordinal, y)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = list(X_temp.columns)

    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            [self.feature_names.remove(x) for x in self.drop_cols]
        except ValueError as e:  # list.remove raises ValueError when the item is absent
            if self.verbose > 0:
                print("Could not remove column from feature names. "
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    # first check the type
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y=None, **kwargs):
    self._dim = X.shape[1]

    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(cols=self.cols,
                                          handle_unknown='value',
                                          handle_missing='value')
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    self.mapping = self.generate_mapping()

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = list(X_temp.columns)

    return self
def create_regression_pipeline(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    numerical_indexes = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    non_numerical_indexes = np.array([], int)
    one_hot_indexes_after_handle_missing_values = np.array([], int)
    ordinal_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"), numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle categorical features",
            ColumnTransformer(
                [
                    (
                        "feature_encoder_ordinal",
                        OrdinalEncoder(),
                        ordinal_indexes_after_handle_missing_values,
                    ),
                    (
                        "feature_encoder_onehot",
                        OneHotEncoder(),
                        one_hot_indexes_after_handle_missing_values,
                    ),
                ],
                remainder="passthrough",
            ),
        ),
        ("estimator", LinearRegression(fit_intercept=True)),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    return {
        'features_train': X_train,
        'features_test': X_test,
        'target_train': y_train,
        'target_test': y_test,
        'target_predicted': y_pred,
        'regression_pipeline': pipeline
    }
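# Hypothetical call of create_regression_pipeline: thirteen numeric feature
# columns to match `numerical_indexes` above; random data stands in for a
# real feature matrix.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 13)
y_demo = rng.rand(100)
results = create_regression_pipeline(X_demo, y_demo)
print(results['regression_pipeline'].score(results['features_test'],
                                           results['target_test']))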
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """
    Binary encoding encodes the integers as binary code with one column per digit.
    """

    def __init__(self, verbose=0, cols=None):
        """
        :param verbose:
        :param cols:
        :return:
        """
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)

    def fit(self, X, y=None, **kwargs):
        """
        :param X:
        :param y:
        :param kwargs:
        :return:
        """
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        return self

    def transform(self, X):
        """
        :param X:
        :return:
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        X = self.ordinal_encoder.transform(X)

        return binary(X, cols=self.cols)
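# Usage sketch for the BinaryEncoder wrapper above, assuming the module-level
# binary() helper it delegates to is available alongside it.
import pandas as pd

df = pd.DataFrame({'letter': list('abcdab')})
be = BinaryEncoder(cols=['letter'])
df_bin = be.fit(df).transform(df)  # one 0/1 column per binary digit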
class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
    """Backward difference contrast coding for categorical features."""

    def __init__(self, verbose=0, cols=None):
        """
        :param verbose:
        :param cols:
        :return:
        """
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)

    def fit(self, X, y=None, **kwargs):
        """
        :param X:
        :param y:
        :param kwargs:
        :return:
        """
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        return self

    def transform(self, X):
        """
        :param X:
        :return:
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        X = self.ordinal_encoder.transform(X)

        return backward_difference_coding(X, cols=self.cols)
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    # first check the type
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    for col in self.cols:
        self.digits_per_col[col] = self.calc_required_digits(X, col)

    # do a transform on the training data to get a column list
    X_t = self.transform(X, override_return_df=True)
    self._encoded_columns = X_t.columns.values

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y=None, **kwargs):
    """
    :param X:
    :param y:
    :param kwargs:
    :return:
    """
    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def ordinal_encoder(self, df, configger):
    """
    :param df: the train dataset.
    :param configger: the json str of configger setting, the params means:
        verbose: int
            integer indicating verbosity of the output. 0 for none.
        cols: list
            a list of columns to encode, if None, all string columns will be encoded.
        drop_invariant: bool
            boolean for whether or not to drop columns with 0 variance.
        return_df: bool
            boolean for whether to return a pandas DataFrame from transform
            (otherwise it will be a numpy array).
        mapping: list of dict
            a mapping of class to label to use for the encoding, optional.
            the dict contains the keys 'col' and 'mapping'.
            the value of 'col' should be the feature name.
            the value of 'mapping' should be a dictionary of 'original_label' to 'encoded_label'.
            example mapping: [{'col': 'col1', 'mapping': {None: 0, 'a': 1, 'b': 2}}]
        handle_unknown: str
            options are 'error', 'return_nan' and 'value', defaults to 'value',
            which will impute the category -1.
        handle_missing: str
            options are 'error', 'return_nan', and 'value', defaults to 'value',
            which treats nan as a category at fit time, or -2 at transform time
            if nan is not a category during fit.

    :return: the transform result
    """
    X, y, encode_col = self.get_Xy(df, configger)

    drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
    handle_missing = set_default_vale("handle_missing", configger, "value")
    handle_unknown = set_default_vale("handle_unknown", configger, "value")

    encoder = OrdinalEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant,
                             return_df=True, handle_unknown=handle_unknown,
                             handle_missing=handle_missing)

    res = encoder.fit_transform(X, y)

    return res
def EncodeCategoricalData(train_df, test_df):
    """encode data with OrdinalEncoder

    Parameters
    ----------
    train_df: dataframe
        training dataframe object to fit and transform
    test_df: dataframe
        test dataframe object to transform

    Returns
    -------
    transformed training and test dataframe
    """
    # column list to ordinal encode
    ordinal_encode_cols = ["country", "province", "region_1", "taster_name", "variety"]

    # create ordinal encode object
    # object assigns -1 to the first-time-seen values of the test set
    ordinal_encoder = OrdinalEncoder(cols=ordinal_encode_cols,
                                     return_df=True,
                                     handle_unknown="value",
                                     handle_missing="return_nan")

    # fit object on the train dataset
    ordinal_encoder.fit(train_df)

    # transform train and test datasets
    ord_encoded_train = ordinal_encoder.transform(train_df)
    ord_encoded_test = ordinal_encoder.transform(test_df)

    return ord_encoded_train, ord_encoded_test
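# Usage sketch for EncodeCategoricalData with toy wine-review-shaped frames.
# The values below are invented; only the column names must match the
# ordinal_encode_cols list above. Unseen test categories map to -1.
import pandas as pd

train = pd.DataFrame({'country': ['US', 'FR'], 'province': ['CA', 'Alsace'],
                      'region_1': ['Napa', 'Alsace'], 'taster_name': ['a', 'b'],
                      'variety': ['Merlot', 'Riesling']})
test = pd.DataFrame({'country': ['IT'], 'province': ['Tuscany'],
                     'region_1': ['Chianti'], 'taster_name': ['c'],
                     'variety': ['Sangiovese']})
enc_train, enc_test = EncodeCategoricalData(train, test)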
def fetch_TICTACTOE(path, valid_size=0.2, test_size=0.2, seed=None):
    path = Path(path)
    data_path = path / 'tic-tac-toe.data'
    if not data_path.exists():
        path.mkdir(parents=True, exist_ok=True)
        download(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data',
            data_path)
    data = pd.read_csv(data_path, names=np.arange(10))
    encoder = OrdinalEncoder(return_df=False)
    data = encoder.fit_transform(data)
    X, Y = (data[:, :-1]).astype(np.float32), (data[:, -1] - 1).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=test_size, random_state=seed)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, stratify=y_train,
        test_size=valid_size / (1 - test_size), random_state=seed)
    return dict(X_train=X_train, y_train=y_train,
                X_valid=X_val, y_valid=y_val,
                X_test=X_test, y_test=y_test)
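# Usage sketch: fetch_TICTACTOE downloads the UCI dataset on first call and
# returns stratified train/valid/test splits as a dict of numpy arrays.
splits = fetch_TICTACTOE('./data/tictactoe', valid_size=0.2, test_size=0.2, seed=42)
print(splits['X_train'].shape, splits['y_train'].shape)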
def fit(self, X, y=None, **kwargs):
    """Fits an ordinal encoder to produce a consistent mapping across
    applications and optionally finds generally invariant columns to drop
    consistently.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    # first check the type
    if not isinstance(X, pd.DataFrame):
        if isinstance(X, list):
            X = pd.DataFrame(np.array(X))
        elif isinstance(X, (np.generic, np.ndarray)):
            X = pd.DataFrame(X)
        else:
            raise ValueError('Unexpected input type: %s' % (str(type(X))))

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name
    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    """
    encoder = None  # initialize so an unknown name raises below instead of NameError
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
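# A dict-based sketch of the same dispatch, which avoids the long if-chain
# and makes the supported names explicit (same encoder classes assumed to be
# in scope as above).
ENCODERS = {
    "FrequencyEncoder": FrequencyEncoder,
    "WOEEncoder": WOEEncoder,
    "TargetEncoder": TargetEncoder,
    "SumEncoder": SumEncoder,
    "MEstimateEncoder": MEstimateEncoder,
    "LeaveOneOutEncoder": LeaveOneOutEncoder,
    "HelmertEncoder": HelmertEncoder,
    "BackwardDifferenceEncoder": BackwardDifferenceEncoder,
    "JamesSteinEncoder": JamesSteinEncoder,
    "OrdinalEncoder": OrdinalEncoder,
    "CatBoostEncoder": CatBoostEncoder,
    "OneHotEncoder": OneHotEncoder,
}

def get_single_encoder_v2(encoder_name: str, cat_cols: list):
    try:
        return ENCODERS[encoder_name](cols=cat_cols)
    except KeyError:
        raise NotImplementedError("Unknown encoder: %s" % encoder_name)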
def create_clustering_pipeline(X):
    numerical_indexes = np.array([0, 1, 2, 3])
    non_numerical_indexes = np.array([], int)
    non_numerical_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"), numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle_categorical_features",
            ColumnTransformer(
                [(
                    "feature_encoder",
                    OrdinalEncoder(),
                    non_numerical_indexes_after_handle_missing_values,
                )],
                remainder="passthrough",
            ),
        ),
        (
            "estimator",
            KMeans(n_clusters=3, n_init=10, max_iter=300),
        ),
    ])

    _ = pipeline.fit_transform(X)
    clusters = pipeline.named_steps.estimator.labels_

    return {'clusters': clusters, 'clustering_pipeline': pipeline}
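# Hypothetical call of create_clustering_pipeline: four numeric columns to
# match `numerical_indexes` above; random data stands in for a real matrix.
import numpy as np

X_demo = np.random.RandomState(0).rand(60, 4)
out = create_clustering_pipeline(X_demo)
print(out['clusters'][:10])  # cluster label per row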
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    if not isinstance(X, pd.DataFrame):
        if isinstance(X, list):
            X = pd.DataFrame(np.array(X))
        elif isinstance(X, (np.generic, np.ndarray)):
            X = pd.DataFrame(X)
        else:
            raise ValueError('Unexpected input type: %s' % (str(type(X))))

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def cross_validate_forest(X_train, y_train, pipes, grids, kfolds, random_search=False):
    """Cross-validate RandomForestClassifier pipeline."""
    pipes['forest'] = make_pipeline(
        CategoricalToString(),
        SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                               mode_cols=['Embarked']),
        OrdinalEncoder(cols=['Title', 'Deck', 'Embarked'],
                       handle_unknown='impute'),
        RandomForestClassifier(**{'bootstrap': True,
                                  'max_depth': 70,
                                  'max_features': 'auto',
                                  'min_samples_leaf': 4,
                                  'min_samples_split': 10,
                                  'n_estimators': 64,
                                  'random_state': RANDOM_SEED}))
    if random_search:
        n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=10)]
        max_features = ['auto', 'sqrt']
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [1, 2, 4]
        bootstrap = [True, False]
        random_grid = {'randomforestclassifier__n_estimators': n_estimators,
                       'randomforestclassifier__max_features': max_features,
                       'randomforestclassifier__max_depth': max_depth,
                       'randomforestclassifier__min_samples_split': min_samples_split,
                       'randomforestclassifier__min_samples_leaf': min_samples_leaf,
                       'randomforestclassifier__bootstrap': bootstrap}
        pprint.pprint(random_grid)
        randsearch = RandomizedSearchCV(pipes['forest'], random_grid, n_iter=50,
                                        cv=3, verbose=0, random_state=42)
        start = time.time()
        randsearch.fit(X_train, y_train)
        finish = time.time()
        print('randsearch.fit execution time:', finish - start)
        pprint.pprint(randsearch.best_params_)
    forest = ExtendedClassifier.cross_validate(pipes['forest'], X_train, y_train,
                                               sklearn_cvs_kws={'cv': kfolds},
                                               param_strategy='init',
                                               logdir_path=r'logs/models/forest',
                                               serialize_to=r'models/forest.pickle')
    return forest
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def get_single_encoder(encoder_name: str, cat_cols: list):
    encoder = None  # initialize so an unknown name returns None instead of raising NameError
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)
    # assert encoder is not None
    return encoder
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    # first check the type
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        X_temp = self.transform(X)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
class SumEncoder(BaseEstimator, TransformerMixin):
    """Sum contrast coding for categorical features."""

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True):
        """
        :param verbose: (optional, default=0) integer indicating verbosity of output. 0 for none.
        :param cols: (optional, default=None) a list of columns to encode, if None, all string
            columns will be encoded
        :param drop_invariant: (optional, default=False) boolean for whether or not to drop
            columns with 0 variance
        :param return_df: (optional, default=True) boolean for whether to return a pandas
            DataFrame from transform (otherwise it will be a numpy array)
        :return:
        """
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = None

    def fit(self, X, y=None, **kwargs):
        """
        :param X:
        :param y:
        :param kwargs:
        :return:
        """
        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """
        :param X:
        :return:
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if not self.cols:
            return X

        X = self.ordinal_encoder.transform(X)
        X = sum_coding(X, cols=self.cols)

        if self.drop_invariant:
            for col in self.drop_cols:
                X.drop(col, axis=1, inplace=True)

        if self.return_df:
            return X
        else:
            return X.values
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Onehot (or dummy) coding for categorical features, produces one feature per category, each binary.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
    impute_missing: bool
        boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
    handle_unknown: str
        options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1.
        Warning: if impute is used, an extra column will be added in if the transform matrix has unknown
        categories. This can cause unexpected changes in dimension in some cases.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = OneHotEncoder(cols=['CHAS', 'RAD']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 22 columns):
    CHAS_0     506 non-null int64
    CHAS_1     506 non-null int64
    RAD_0      506 non-null int64
    RAD_1      506 non-null int64
    RAD_2      506 non-null int64
    RAD_3      506 non-null int64
    RAD_4      506 non-null int64
    RAD_5      506 non-null int64
    RAD_6      506 non-null int64
    RAD_7      506 non-null int64
    RAD_8      506 non-null int64
    CRIM       506 non-null float64
    ZN         506 non-null float64
    INDUS      506 non-null float64
    NOX        506 non-null float64
    RM         506 non-null float64
    AGE        506 non-null float64
    DIS        506 non-null float64
    TAX        506 non-null float64
    PTRATIO    506 non-null float64
    B          506 non-null float64
    LSTAT      506 non-null float64
    dtypes: float64(11), int64(11)
    memory usage: 87.0 KB
    None

    References
    ----------
    .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group.
       from https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
    .. [2] Gregory Carey (2003). Coding Categorical Variables, from
       http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
                 impute_missing=True, handle_unknown='impute'):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None
        self.impute_missing = impute_missing
        self.handle_unknown = handle_unknown

    @property
    def category_mapping(self):
        return self.ordinal_encoder.category_mapping

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        p : array, shape = [n_samples, n_numeric + N]
            Transformed values with encoding applied.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # first check the type
        X = convert_input(X)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        if not self.cols:
            return X if self.return_df else X.values

        X = self.ordinal_encoder.transform(X)
        X = self.get_dummies(X, cols=self.cols)

        if self.drop_invariant:
            for col in self.drop_cols:
                X.drop(col, axis=1, inplace=True)

        if self.return_df:
            return X
        else:
            return X.values

    def inverse_transform(self, Xt):
        """
        Perform the inverse transformation to encoded data.

        Parameters
        ----------
        X_in : array-like, shape = [n_samples, n_features]

        Returns
        -------
        p: array, the same size of X_in
        """
        X = Xt.copy(deep=True)

        # first check the type
        X = convert_input(X)

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to inverse_transform data')

        X = self.reverse_dummies(X, self.cols)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            if self.drop_invariant:
                raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                                 "be set as False when transforming data" % (X.shape[1],))
            else:
                raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        if not self.cols:
            return X if self.return_df else X.values

        if self.impute_missing and self.handle_unknown == 'impute':
            for col in self.cols:
                if any(X[col] == -1):
                    raise ValueError("inverse_transform is not supported because transform imputes "
                                     "the unknown category -1 when encoding %s" % (col,))

        for switch in self.ordinal_encoder.mapping:
            col_dict = {col_pair[1]: col_pair[0] for col_pair in switch.get('mapping')}
            X[switch.get('col')] = X[switch.get('col')].apply(lambda x: col_dict.get(x))

        return X if self.return_df else X.values

    def get_dummies(self, X_in, cols=None):
        """
        Convert numerical variable into dummy variables

        Parameters
        ----------
        X_in: DataFrame
        cols: list-like, default None
            Column names in the DataFrame to be encoded

        Returns
        -------
        dummies : DataFrame
        """
        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            pass_thru = [col for col in X.columns.values if col not in cols]

        bin_cols = []
        for col in cols:
            col_tuples = [class_map['mapping'] for class_map in self.ordinal_encoder.mapping
                          if class_map['col'] == col][0]
            fit_classes = [col_val[1] for col_val in col_tuples]
            if self.handle_unknown == 'impute':
                fit_classes.append(-1)
            for class_ in fit_classes:
                n_col_name = str(col) + '_%s' % (class_,)
                X[n_col_name] = X[col] == class_
                bin_cols.append(n_col_name)

        X = X.reindex(columns=bin_cols + pass_thru)

        # convert all of the bools into integers.
        for col in bin_cols:
            X[col] = X[col].astype(int)

        return X

    def reverse_dummies(self, X, cols):
        """
        Convert dummy variable into numerical variables

        Parameters
        ----------
        X : DataFrame
        cols: list-like
            Column names in the DataFrame that be encoded

        Returns
        -------
        numerical: DataFrame
        """
        out_cols = X.columns.values

        for col in cols:
            col_list = [col0 for col0 in out_cols if col0.startswith(col)]
            value_array = np.array([int(col0.split('_')[1]) for col0 in col_list])
            X[col] = np.dot(X[col_list].values, value_array.T)
            out_cols = [col0 for col0 in out_cols if col0 not in col_list]

        X = X.reindex(columns=out_cols + cols)

        return X
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Onehot (or dummy) coding for categorical features, produces one feature per category, each binary.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform
        (otherwise it will be a numpy array).
    use_cat_names: bool
        if True, category values will be included in the encoded column names. Since this can result
        in duplicate column names, duplicates are suffixed with '#' symbol until a unique name is
        generated. If False, category indices will be used instead of the category values.
    handle_unknown: str
        options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'.
        Warning: if indicator is used, an extra column will be added in if the transform matrix
        has unknown categories. This can cause unexpected changes in dimension in some cases.
    handle_missing: str
        options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'.
        Warning: if indicator is used, an extra column will be added in if the transform matrix
        has nan values. This can cause unexpected changes in dimension in some cases.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = OneHotEncoder(cols=['CHAS', 'RAD'], handle_unknown='indicator').fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 24 columns):
    CRIM       506 non-null float64
    ZN         506 non-null float64
    INDUS      506 non-null float64
    CHAS_1     506 non-null int64
    CHAS_2     506 non-null int64
    CHAS_-1    506 non-null int64
    NOX        506 non-null float64
    RM         506 non-null float64
    AGE        506 non-null float64
    DIS        506 non-null float64
    RAD_1      506 non-null int64
    RAD_2      506 non-null int64
    RAD_3      506 non-null int64
    RAD_4      506 non-null int64
    RAD_5      506 non-null int64
    RAD_6      506 non-null int64
    RAD_7      506 non-null int64
    RAD_8      506 non-null int64
    RAD_9      506 non-null int64
    RAD_-1     506 non-null int64
    TAX        506 non-null float64
    PTRATIO    506 non-null float64
    B          506 non-null float64
    LSTAT      506 non-null float64
    dtypes: float64(11), int64(13)
    memory usage: 95.0 KB
    None

    References
    ----------
    .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group.
       from https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
    .. [2] Gregory Carey (2003). Coding Categorical Variables, from
       http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
                 handle_missing='value', handle_unknown='value', use_cat_names=False):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.mapping = None
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        self.use_cat_names = use_cat_names
        self.feature_names = None

    @property
    def category_mapping(self):
        return self.ordinal_encoder.category_mapping

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        # first check the type
        X = util.convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().bool():
                raise ValueError('Columns to be encoded cannot contain null')

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            handle_unknown='value',
            handle_missing='value'
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        self.mapping = self.generate_mapping()

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = list(X_temp.columns)

        if self.drop_invariant:
            self.drop_cols = []
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
            try:
                [self.feature_names.remove(x) for x in self.drop_cols]
            except ValueError as e:  # list.remove raises ValueError when the item is absent
                if self.verbose > 0:
                    print("Could not remove column from feature names. "
                          "Not found in generated cols.\n{}".format(e))

        return self

    def generate_mapping(self):
        mapping = []
        found_column_counts = {}

        for switch in self.ordinal_encoder.mapping:
            col = switch.get('col')
            values = switch.get('mapping').copy(deep=True)

            if self.handle_missing == 'value':
                values = values[values > 0]

            if len(values) == 0:
                continue

            index = []
            new_columns = []

            for cat_name, class_ in values.items():
                if self.use_cat_names:
                    n_col_name = str(col) + '_%s' % (cat_name,)
                    found_count = found_column_counts.get(n_col_name, 0)
                    found_column_counts[n_col_name] = found_count + 1
                    n_col_name += '#' * found_count
                else:
                    n_col_name = str(col) + '_%s' % (class_,)

                index.append(class_)
                new_columns.append(n_col_name)

            if self.handle_unknown == 'indicator':
                n_col_name = str(col) + '_%s' % (-1,)
                if self.use_cat_names:
                    found_count = found_column_counts.get(n_col_name, 0)
                    found_column_counts[n_col_name] = found_count + 1
                    n_col_name += '#' * found_count
                new_columns.append(n_col_name)
                index.append(-1)

            base_matrix = np.eye(N=len(index), dtype=int)
            base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index)

            if self.handle_unknown == 'value':
                base_df.loc[-1] = 0
            elif self.handle_unknown == 'return_nan':
                base_df.loc[-1] = np.nan

            if self.handle_missing == 'return_nan':
                base_df.loc[values.loc[np.nan]] = np.nan
            elif self.handle_missing == 'value':
                base_df.loc[-2] = 0

            mapping.append({'col': col, 'mapping': base_df})

        return mapping

    def transform(self, X, override_return_df=False):
        """Perform the transformation to new categorical data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        p : array, shape = [n_samples, n_numeric + N]
            Transformed values with encoding applied.
        """
        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().bool():
                raise ValueError('Columns to be encoded cannot contain null')

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # first check the type
        X = util.convert_input(X)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        if not self.cols:
            return X if self.return_df else X.values

        X = self.ordinal_encoder.transform(X)

        if self.handle_unknown == 'error':
            if X[self.cols].isin([-1]).any().any():
                raise ValueError('Columns to be encoded cannot contain new values')

        X = self.get_dummies(X)

        if self.drop_invariant:
            for col in self.drop_cols:
                X.drop(col, axis=1, inplace=True)

        if self.return_df or override_return_df:
            return X
        else:
            return X.values

    def inverse_transform(self, X_in):
        """
        Perform the inverse transformation to encoded data.

        Parameters
        ----------
        X_in : array-like, shape = [n_samples, n_features]

        Returns
        -------
        p: array, the same size of X_in
        """
        X = X_in.copy(deep=True)

        # first check the type
        X = util.convert_input(X)

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to inverse_transform data')

        X = self.reverse_dummies(X, self.mapping)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            if self.drop_invariant:
                raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                                 "be set as False when transforming data" % (X.shape[1],))
            else:
                raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        if not self.cols:
            return X if self.return_df else X.values

        for switch in self.ordinal_encoder.mapping:
            column_mapping = switch.get('mapping')
            inverse = pd.Series(data=column_mapping.index, index=column_mapping.values)
            X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type'))

            if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan':
                for col in self.cols:
                    if X[col].isnull().any():
                        warnings.warn("inverse_transform is not supported because transform imputes "
                                      "the unknown category nan when encoding %s" % (col,))

        return X if self.return_df else X.values

    def get_dummies(self, X_in):
        """
        Convert numerical variable into dummy variables

        Parameters
        ----------
        X_in: DataFrame

        Returns
        -------
        dummies : DataFrame
        """
        X = X_in.copy(deep=True)

        cols = X.columns.values.tolist()

        for switch in self.mapping:
            col = switch.get('col')
            mod = switch.get('mapping')

            base_df = mod.reindex(X[col])
            base_df = base_df.set_index(X.index)
            X = pd.concat([base_df, X], axis=1)

            old_column_index = cols.index(col)
            cols[old_column_index: old_column_index + 1] = mod.columns

        X = X.reindex(columns=cols)

        return X

    def reverse_dummies(self, X, mapping):
        """
        Convert dummy variable into numerical variables

        Parameters
        ----------
        X : DataFrame
        mapping: list-like
            Contains mappings of column to be transformed to it's new columns and value represented

        Returns
        -------
        numerical: DataFrame
        """
        out_cols = X.columns.values.tolist()
        mapped_columns = []

        for switch in mapping:
            col = switch.get('col')
            mod = switch.get('mapping')
            insert_at = out_cols.index(mod.columns[0])

            X.insert(insert_at, col, 0)
            positive_indexes = mod.index[mod.index > 0]
            for i in range(positive_indexes.shape[0]):
                existing_col = mod.columns[i]
                val = positive_indexes[i]
                X.loc[X[existing_col] == 1, col] = val
                mapped_columns.append(existing_col)
            X.drop(mod.columns, axis=1, inplace=True)
            out_cols = X.columns.values.tolist()

        return X

    def get_feature_names(self):
        """
        Returns the names of all transformed / added columns.

        Returns
        -------
        feature_names: list
            A list with all feature names transformed or added.
            Note: potentially dropped features are not included!
        """
        if not isinstance(self.feature_names, list):
            raise ValueError('Must transform data first. Affected feature names are not known before.')
        else:
            return self.feature_names
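# Usage sketch for the round trip this OneHotEncoder supports: encode, read
# the generated column names, then invert back to the original categories.
# The toy frame is invented, and inverse_transform behavior can be sensitive
# to the library version.
import pandas as pd

df = pd.DataFrame({'fuel': ['gas', 'diesel', 'gas', 'lpg']})
ohe = OneHotEncoder(cols=['fuel'], use_cat_names=True)
df_oh = ohe.fit_transform(df)
print(ohe.get_feature_names())  # e.g. ['fuel_gas', 'fuel_diesel', 'fuel_lpg']
df_back = ohe.inverse_transform(df_oh)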
class BaseNEncoder(BaseEstimator, TransformerMixin): """Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual categories is equivalent to vanilla ordinal encoding. Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded drop_invariant: bool boolean for whether or not to drop columns with 0 variance return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array) impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. Example ------- >>>from category_encoders import * >>>import pandas as pd >>>from sklearn.datasets import load_boston >>>bunch = load_boston() >>>y = bunch.target >>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>>enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>>numeric_dataset = enc.transform(X) >>>print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 16 columns): CHAS_0 506 non-null int64 RAD_0 506 non-null int64 RAD_1 506 non-null int64 RAD_2 506 non-null int64 RAD_3 506 non-null int64 CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 LSTAT 506 non-null float64 dtypes: float64(11), int64(5) memory usage: 63.3 KB None """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2, impute_missing=True, handle_unknown='impute'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.impute_missing = impute_missing self.handle_unknown = handle_unknown self.cols = cols self.ordinal_encoder = None self._dim = None self.base = base self._encoded_columns = None self.digits_per_col = {} def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. 
""" # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) for col in self.cols: self.digits_per_col[col] = self.calc_required_digits(X, col) # do a transform on the training data to get a column list X_t = self.transform(X, override_return_df=True) self._encoded_columns = X_t.columns.values # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # first check the type X = convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = self.basen_encode(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) X.fillna(0.0, inplace=True) if self.return_df or override_return_df: return X else: return X.values def inverse_transform(self, Xt): """ Perform the inverse transformation to encoded data. 
        Parameters
        ----------

        Xt : array-like, shape = [n_samples, n_features]

        Returns
        -------

        p: array, the same size as Xt

        """
        warnings.warn('Inverse transform in basen is currently an experimental feature, please be careful')
        X = Xt.copy(deep=True)

        # first check the type
        X = convert_input(X)

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to inverse_transform data')

        X = self.basen_to_integer(X, self.cols, self.base)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            if self.drop_invariant:
                raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                                 "be set to False when transforming data" % (X.shape[1],))
            else:
                raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        if not self.cols:
            return X if self.return_df else X.values

        if self.impute_missing and self.handle_unknown == 'impute':
            for col in self.cols:
                if any(X[col] == -1):
                    raise ValueError("inverse_transform is not supported because transform imputes "
                                     "the unknown category -1 when encoding %s" % (col,))

        for switch in self.ordinal_encoder.mapping:
            col_dict = {col_pair[1]: col_pair[0] for col_pair in switch.get('mapping')}
            X[switch.get('col')] = X[switch.get('col')].apply(lambda x: col_dict.get(x))

        return X if self.return_df else X.values

    def calc_required_digits(self, X, col):
        # figure out how many digits we need to represent the classes present
        if self.base == 1:
            digits = len(X[col].unique())
        else:
            digits = int(np.ceil(math.log(len(X[col].unique()), self.base)))

        return digits

    def basen_encode(self, X_in, cols=None):
        """
        Base-N encoding encodes the integers as base-N code with one column per digit.

        Parameters
        ----------
        X_in: DataFrame
        cols: list-like, default None
            Column names in the DataFrame to be encoded

        Returns
        -------
        dummies : DataFrame

        """
        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            pass_thru = [col for col in X.columns.values if col not in cols]

        bin_cols = []
        for col in cols:
            # get how many digits we need to represent the classes present
            digits = self.calc_required_digits(X, col)

            # map the ordinal column into a list of these digits, of length digits
            X[col] = X[col].map(lambda x: self.col_transform(x, digits))

            for dig in range(digits):
                X[str(col) + '_%d' % (dig,)] = X[col].map(lambda r: int(r[dig]) if r is not None else None)
                bin_cols.append(str(col) + '_%d' % (dig,))

        if self._encoded_columns is None:
            X = X.reindex(columns=bin_cols + pass_thru)
        else:
            X = X.reindex(columns=self._encoded_columns)

        return X

    def basen_to_integer(self, X, cols, base):
        """
        Convert base-N code back into integers.

        Parameters
        ----------
        X : DataFrame
            encoded data
        cols : list-like
            Column names in the DataFrame that were encoded
        base : int
            The base of transform

        Returns
        -------
        numerical: DataFrame

        """
        out_cols = X.columns.values

        for col in cols:
            # match only this column's generated digit columns (col_0, col_1, ...),
            # so a column whose name is a prefix of another does not collide
            col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col) + '_')]
            for col0 in col_list:
                if any(X[col0].isnull()):
                    raise ValueError("inverse_transform is not supported because transform imputes "
                                     "the unknown category -1 when encoding %s" % (col,))

            if base == 1:
                value_array = np.array([int(col0.split('_')[-1]) for col0 in col_list])
            else:
                len0 = len(col_list)
                value_array = np.array([base ** (len0 - 1 - i) for i in range(len0)])
            X[col] = np.dot(X[col_list].values, value_array.T)
            out_cols = [col0 for col0 in out_cols if col0 not in col_list]

        X = X.reindex(columns=out_cols + cols)

        return X

    def col_transform(self, col, digits):
        """
        The lambda body to transform the column values
        """
        if col is None or float(col) < 0.0:
            return None
        else:
            col = self.numberToBase(int(col), self.base, digits)
            if len(col) == digits:
                return col
            else:
                return [0 for _ in range(digits - len(col))] + col

    @staticmethod
    def numberToBase(n, b, limit):
        if b == 1:
            return [0 if n != _ else 1 for _ in range(limit)]
        if n == 0:
            return [0 for _ in range(limit)]
        digits = []
        for _ in range(limit):
            digits.append(int(n % b))
            n, _ = divmod(n, b)
        return digits[::-1]
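# Standalone illustration of the digit expansion performed by numberToBase
# above (same logic, written independently for clarity; not library code).
def number_to_base(n, b, limit):
    if b == 1:                        # "base 1" degenerates into a one-hot position
        return [1 if i == n else 0 for i in range(limit)]
    digits = []
    for _ in range(limit):
        n, remainder = divmod(n, b)
        digits.append(remainder)
    return digits[::-1]

print(number_to_base(5, 2, 3))        # [1, 0, 1] -> columns col_0, col_1, col_2
print(number_to_base(5, 3, 2))        # [1, 2]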
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)

    Example
    -------
    >>> from category_encoders import BinaryEncoder
    >>> from sklearn.datasets import fetch_20newsgroups_vectorized
    >>> bunch = fetch_20newsgroups_vectorized(subset="all")
    >>> X, y = bunch.data, bunch.target
    >>> enc = BinaryEncoder(return_df=False).fit(X, y)
    >>> numeric_dataset = enc.transform(X)

    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """
        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        if not isinstance(X, pd.DataFrame):
            if isinstance(X, list):
                X = pd.DataFrame(np.array(X))
            elif isinstance(X, (np.generic, np.ndarray)):
                X = pd.DataFrame(X)
            else:
                raise ValueError('Unexpected input type: %s' % (str(type(X))))

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]

        Returns
        -------

        p : array, shape = [n_samples, n_numeric + N]
            Transformed values with encoding applied.

        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # first check the type
        if not isinstance(X, pd.DataFrame):
            if isinstance(X, list):
                X = pd.DataFrame(np.array(X))
            elif isinstance(X, (np.generic, np.ndarray)):
                X = pd.DataFrame(X)
            else:
                raise ValueError('Unexpected input type: %s' % (str(type(X))))

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        if not self.cols:
            return X

        X = self.ordinal_encoder.transform(X)

        X = self.binary(X, cols=self.cols)

        if self.drop_invariant:
            for col in self.drop_cols:
                X.drop(col, 1, inplace=True)

        if self.return_df:
            return X
        else:
            return X.values

    def binary(self, X_in, cols=None):
        """
        Binary encoding encodes the integers as binary code with one column per digit.

        :param X_in: DataFrame to encode
        :param cols: list of columns to encode (all columns if None)
        :return: DataFrame with one binary digit column per bit

        """
        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            pass_thru = [col for col in X.columns.values if col not in cols]

        bin_cols = []
        for col in cols:
            # figure out how many digits we need to represent the classes present
            digits = int(np.ceil(np.log2(len(X[col].unique()))))

            # map the ordinal column into a list of these digits, of length digits
            X[col] = X[col].map(lambda x: self.col_transform(x, digits))

            for dig in range(digits):
                X[col + '_%d' % (dig,)] = X[col].map(lambda x: int(x[dig]) if x is not None else None)
                bin_cols.append(col + '_%d' % (dig,))

        X = X.reindex(columns=bin_cols + pass_thru)

        return X

    @staticmethod
    def col_transform(col, digits):
        """
        The lambda body to transform the column values

        :param col: the ordinal value to expand
        :param digits: the number of binary digits to emit
        :return: list of binary digits, zero-padded to length `digits`

        """
        if col is None or float(col) < 0.0:
            return None
        else:
            col = list("{0:b}".format(int(col)))
            if len(col) == digits:
                return col
            else:
                return [0 for _ in range(digits - len(col))] + col
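# Quick check (not library code) of what col_transform above produces: the
# ordinal code is written out in base 2, one output column per bit.
ordinal_code = 5                      # e.g. the 5th category seen during fit
bits = list("{0:b}".format(ordinal_code))
print(bits)                           # ['1', '0', '1'] -> col_0=1, col_1=0, col_2=1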
def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") # The label must be binary with values {0,1} unique = y.unique() if len(unique) != 2: raise ValueError("The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).") if y.isnull().any(): raise ValueError("The target column y must not contain missing values.") if np.max(unique) < 1: raise ValueError("The target column y must be binary with values {0, 1}. Value 1 was not found in the target.") if np.min(unique) > 0: raise ValueError("The target column y must be binary with values {0, 1}. Value 0 was not found in the target.") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
class BinaryEncoder(BaseEstimator, TransformerMixin): """ Binary encoding encodes the integers as binary code with one column per digit. """ def __init__(self, verbose=0, cols=None, drop_invariant=False): """ :param verbose: :param cols: :return: """ self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols) def fit(self, X, y=None, **kwargs): """ :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self def transform(self, X): """ :param X: :return: """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) if self.cols == []: return X X = self.ordinal_encoder.transform(X) X = binary(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) return X
class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
    """Backward difference contrast coding for encoding categorical variables.

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
    impute_missing: bool
        boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
    handle_unknown: str
        options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1.
        Warning: if impute is used, an extra column will be added in if the transform matrix has unknown categories.
        This can cause unexpected changes in dimension in some cases.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = BackwardDifferenceEncoder(cols=['CHAS', 'RAD']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 22 columns):
    col_CHAS_0     506 non-null float64
    col_CHAS_1     506 non-null float64
    col_RAD_0      506 non-null float64
    col_RAD_1      506 non-null float64
    col_RAD_2      506 non-null float64
    col_RAD_3      506 non-null float64
    col_RAD_4      506 non-null float64
    col_RAD_5      506 non-null float64
    col_RAD_6      506 non-null float64
    col_RAD_7      506 non-null float64
    col_RAD_8      506 non-null float64
    col_CRIM       506 non-null float64
    col_ZN         506 non-null float64
    col_INDUS      506 non-null float64
    col_NOX        506 non-null float64
    col_RM         506 non-null float64
    col_AGE        506 non-null float64
    col_DIS        506 non-null float64
    col_TAX        506 non-null float64
    col_PTRATIO    506 non-null float64
    col_B          506 non-null float64
    col_LSTAT      506 non-null float64
    dtypes: float64(22)
    memory usage: 87.0 KB
    None

    References
    ----------

    .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
    https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.

    .. [2] Gregory Carey (2003). Coding Categorical Variables, from
    http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf

    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
                 impute_missing=True, handle_unknown='impute'):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.impute_missing = impute_missing
        self.handle_unknown = handle_unknown
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None

    def fit(self, X, y=None, **kwargs):
        """Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds
        generally invariant columns to drop consistently.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.
""" # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self def transform(self, X): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') # first check the type X = convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % ( X.shape[1], self._dim, )) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = self.backward_difference_coding(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df: return X else: return X.values @staticmethod def backward_difference_coding(X_in, cols=None): """ """ X = X_in.copy(deep=True) X.columns = ['col_' + str(x) for x in X.columns.values] cols = ['col_' + str(x) for x in cols] if cols is None: cols = X.columns.values pass_thru = [] else: pass_thru = [col for col in X.columns.values if col not in cols] bin_cols = [] for col in cols: mod = dmatrix("C(%s, Diff)" % (col, ), X) for dig in range(len(mod[0])): X[str(col) + '_%d' % (dig, )] = mod[:, dig] bin_cols.append(str(col) + '_%d' % (dig, )) X = X.reindex(columns=bin_cols + pass_thru) X.fillna(0.0) return X
class HelmertEncoder(BaseEstimator, TransformerMixin): """ Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded drop_invariant: bool boolean for whether or not to drop columns with 0 variance return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array) Example ------- >>> from category_encoders import HelmertEncoder >>> from sklearn.datasets import fetch_20newsgroups_vectorized >>> bunch = fetch_20newsgroups_vectorized(subset="all") >>> X, y = bunch.data, bunch.target >>> enc = HelmertEncoder(return_df=False).fit(X, y) >>> numeric_dataset = enc.transform(X) References ---------- .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from http://www.ats.ucla.edu/stat/r/library/contrast_coding. .. [2] Gregory Carey (2003). Coding Categorical Variables, from http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.ordinal_encoder = None self._dim = None def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type if not isinstance(X, pd.DataFrame): if isinstance(X, list): X = pd.DataFrame(np.array(X)) elif isinstance(X, (np.generic, np.ndarray)): X = pd.DataFrame(X) else: raise ValueError('Unexpected input type: %s' % (str(type(X)))) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self def transform(self, X): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # first check the type if not isinstance(X, pd.DataFrame): if isinstance(X, list): X = pd.DataFrame(np.array(X)) elif isinstance(X, (np.generic, np.ndarray)): X = pd.DataFrame(X) else: raise ValueError('Unexpected input type: %s' % (str(type(X)))) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, )) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = self.helmert_coding(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df: return X else: return X.values @staticmethod def helmert_coding(X_in, cols=None): """ :param X: :return: """ X = X_in.copy(deep=True) if cols is None: cols = X.columns.values pass_thru = [] else: pass_thru = [col for col in X.columns.values if col not in cols] bin_cols = [] for col in cols: mod = dmatrix("C(%s, Helmert)" % (col, ), X) for dig in range(len(mod[0])): X[col + '_%d' % (dig, )] = mod[:, dig] bin_cols.append(col + '_%d' % (dig, )) X = X.reindex(columns=bin_cols + pass_thru) return X
class SumEncoder(BaseEstimator, TransformerMixin):
    """Sum contrast coding for the encoding of categorical features.

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
    handle_unknown: str
        options are 'error', 'return_nan', 'value' and 'indicator', defaults to 'value'. Warning: if indicator
        is used, an extra column will be added in if the transform matrix has unknown categories. This can cause
        unexpected changes in the dimension in some cases.
    handle_missing: str
        options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'value'. Warning: if indicator
        is used, an extra column will be added in if the transform matrix has nan values. This can cause
        unexpected changes in dimension in some cases.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = SumEncoder(cols=['CHAS', 'RAD']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 21 columns):
    intercept    506 non-null int64
    CRIM         506 non-null float64
    ZN           506 non-null float64
    INDUS        506 non-null float64
    CHAS_0       506 non-null float64
    NOX          506 non-null float64
    RM           506 non-null float64
    AGE          506 non-null float64
    DIS          506 non-null float64
    RAD_0        506 non-null float64
    RAD_1        506 non-null float64
    RAD_2        506 non-null float64
    RAD_3        506 non-null float64
    RAD_4        506 non-null float64
    RAD_5        506 non-null float64
    RAD_6        506 non-null float64
    RAD_7        506 non-null float64
    TAX          506 non-null float64
    PTRATIO      506 non-null float64
    B            506 non-null float64
    LSTAT        506 non-null float64
    dtypes: float64(20), int64(1)
    memory usage: 83.1 KB
    None

    References
    ----------

    .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
    https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.

    .. [2] Gregory Carey (2003). Coding Categorical Variables, from
    http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf

    """

    def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
                 handle_unknown='value', handle_missing='value'):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.mapping = mapping
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None
        self.feature_names = None

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.
""" # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') col = switch.get('col') column_mapping = self.fit_sum_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # first check the type X = util.convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, )) if not self.cols: return X X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): raise ValueError('Columns to be encoded can not contain new values') X = self.sum_coding(X, mapping=self.mapping) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values @staticmethod def fit_sum_coding(col, values, handle_missing, handle_unknown): if handle_missing == 'value': values = values[values > 0] values_to_encode = values.get_values() if len(values) < 2: return pd.DataFrame(index=values_to_encode) if handle_unknown == 'indicator': values_to_encode = np.append(values_to_encode, -1) sum_contrast_matrix = Sum().code_without_intercept(values_to_encode.tolist()) df = pd.DataFrame(data=sum_contrast_matrix.matrix, index=values_to_encode, columns=[str(col) + '_%d' % (i, ) for i in range(len(sum_contrast_matrix.column_suffixes))]) if handle_unknown == 'return_nan': df.loc[-1] = np.nan elif handle_unknown == 'value': df.loc[-1] = np.zeros(len(values_to_encode) - 1) if handle_missing == 'return_nan': df.loc[values.loc[np.nan]] = np.nan elif handle_missing == 'value': df.loc[-2] = np.zeros(len(values_to_encode) - 1) return df @staticmethod def sum_coding(X_in, mapping): """ """ X = X_in.copy(deep=True) cols = X.columns.values.tolist() X['intercept'] = pd.Series([1] * X.shape[0], index=X.index) for switch in mapping: col = switch.get('col') mod = switch.get('mapping') base_df = mod.loc[X[col]] base_df.set_index(X.index, inplace=True) X = pd.concat([base_df, X], axis=1) old_column_index = cols.index(col) cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols return X.reindex(columns=cols) def get_feature_names(self): """ Returns the names of all transformed / added columns. Returns: -------- feature_names: list A list with all feature names transformed or added. Note: potentially dropped features are not included! """ if not isinstance(self.feature_names, list): raise ValueError("Estimator has to be fitted to return feature names.") else: return self.feature_names
class TargetEncoder(BaseEstimator, TransformerMixin): def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', min_samples_leaf=1, smoothing=1.0): """Target encoding for categorical features. For the case of categorical target: features are replaced with a blend of posterior probability of the target given particular categorical value and prior probability of the target over all the training data. For the case of continuous target: features are replaced with a blend of expected value of the target given particular categorical value and expected value of the target over all the training data. Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded. drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the target mean. min_samples_leaf: int minimum samples to take category average into account. smoothing: float smoothing effect to balance categorical average vs prior. Higher value means stronger regularization. The value must be strictly bigger than 0. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import load_boston >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>> enc = TargetEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 13 columns): CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 CHAS 506 non-null float64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 RAD 506 non-null float64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 LSTAT 506 non-null float64 dtypes: float64(13) memory usage: 51.5 KB None References ---------- .. [1] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems. from https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf. """ self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.ordinal_encoder = None self.min_samples_leaf = min_samples_leaf self.smoothing = float(smoothing) # Make smoothing a float so that python 2 does not treat as integer division self._dim = None self.mapping = None self.handle_unknown = handle_unknown self.handle_missing=handle_missing self._mean = None self.feature_names = None def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. 
""" # unite the input into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) self.mapping = self.fit_target_encoding(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = list(X_temp.columns) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self def fit_target_encoding(self, X, y): mapping = {} for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') prior = self._mean = y.mean() stats = y.groupby(X[col]).agg(['count', 'mean']) smoove = 1 / (1 + np.exp(-(stats['count'] - self.min_samples_leaf) / self.smoothing)) smoothing = prior * (1 - smoove) + stats['mean'] * smoove smoothing[stats['count'] == 1] = prior if self.handle_unknown == 'return_nan': smoothing.loc[-1] = np.nan elif self.handle_unknown == 'value': smoothing.loc[-1] = prior if self.handle_missing == 'return_nan': smoothing.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': smoothing.loc[-2] = prior mapping[col] = smoothing return mapping def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] when transform by leave one out None, when transform without target info (such as transform test set) Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # unite the input into pandas types X = util.convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) # if we are encoding the training data, we have to check the target if y is not None: y = util.convert_input_vector(y, X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") if not self.cols: return X X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): raise ValueError('Unexpected categories found in dataframe') X = self.target_encode(X) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values def fit_transform(self, X, y=None, **fit_params): """ Encoders that utilize the target must make sure that the training data are transformed with: transform(X, y) and not with: transform(X) """ return self.fit(X, y, **fit_params).transform(X, y) def target_encode(self, X_in): X = X_in.copy(deep=True) for col in self.cols: X[col] = X[col].map(self.mapping[col]) return X def get_feature_names(self): """ Returns the names of all transformed / added columns. Returns: -------- feature_names: list A list with all feature names transformed or added. Note: potentially dropped features are not included! """ if not isinstance(self.feature_names, list): raise ValueError('Must fit data first. Affected feature names are not known before.') else: return self.feature_names
class MEstimateEncoder(BaseEstimator, TransformerMixin):
    """M-probability estimate of likelihood.

    This is a simplified version of target encoder. In comparison to target encoder, m-probability estimate
    has only one tunable parameter ('m'), while target encoder has two tunable parameters ('min_samples_leaf'
    and 'smoothing').

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop encoded columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
    handle_missing: str
        options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability.
    handle_unknown: str
        options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability.
    randomized: bool
        adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
    sigma: float
        standard deviation (spread or "width") of the normal distribution.
    m: float
        this is the "m" in the m-probability estimate. Higher value of m results in stronger shrinking.
        M is non-negative.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> bunch = load_boston()
    >>> y = bunch.target > 22.5
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = MEstimateEncoder(cols=['CHAS', 'RAD']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 13 columns):
    CRIM       506 non-null float64
    ZN         506 non-null float64
    INDUS      506 non-null float64
    CHAS       506 non-null float64
    NOX        506 non-null float64
    RM         506 non-null float64
    AGE        506 non-null float64
    DIS        506 non-null float64
    RAD        506 non-null float64
    TAX        506 non-null float64
    PTRATIO    506 non-null float64
    B          506 non-null float64
    LSTAT      506 non-null float64
    dtypes: float64(13)
    memory usage: 51.5 KB
    None

    References
    ----------

    .. [1] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction
    Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538.

    .. [2] Additive smoothing, from
    https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates

    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
                 handle_unknown='value', handle_missing='value', random_state=None,
                 randomized=False, sigma=0.05, m=1.0):
        self.verbose = verbose
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None
        self.mapping = None
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        self._sum = None
        self._count = None
        self.random_state = random_state
        self.randomized = randomized
        self.sigma = sigma
        self.m = m
        self.feature_names = None

    # noinspection PyUnusedLocal
    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and binary y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Binary target values.

        Returns
        -------

        self : encoder
            Returns self.
""" # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. When the data are used for model training, it is important to also pass the target in order to apply leave one out. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] when transform by leave one out None, when transform without target information (such as transform test set) Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # Unite the input into pandas types X = util.convert_input(X) # Then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) # If we are encoding the training data, we have to check the target if y is not None: y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") if not self.cols: return X # Do not modify the input argument X = X.copy(deep=True) X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): raise ValueError('Unexpected categories found in dataframe') # Loop over columns and replace nominal values with WOE X = self._score(X, y) # Postprocessing # Note: We should not even convert these columns. 
if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values def fit_transform(self, X, y=None, **fit_params): """ Encoders that utilize the target must make sure that the training data are transformed with: transform(X, y) and not with: transform(X) """ return self.fit(X, y, **fit_params).transform(X, y) def _train(self, X, y): # Initialize the output mapping = {} # Calculate global statistics self._sum = y.sum() self._count = y.count() prior = self._sum/self._count for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['sum', 'count']) # Count of x_{i,+} and x_i # Calculate the m-probability estimate estimate = (stats['sum'] + prior * self.m) / (self._sum + self.m) # Ignore unique values. This helps to prevent overfitting on id-like columns if len(stats['count'])==self._count: estimate[:] = prior if self.handle_unknown == 'return_nan': estimate.loc[-1] = np.nan elif self.handle_unknown == 'value': estimate.loc[-1] = prior if self.handle_missing == 'return_nan': estimate.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': estimate.loc[-2] = prior # Store the m-probability estimate for transform() function mapping[col] = estimate return mapping def _score(self, X, y): for col in self.cols: # Score the column X[col] = X[col].map(self.mapping[col]) # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0])) return X def get_feature_names(self): """ Returns the names of all transformed / added columns. Returns: -------- feature_names: list A list with all feature names transformed or added. Note: potentially dropped features are not included! """ if not isinstance(self.feature_names, list): raise ValueError("Estimator has to be fitted to return feature names.") else: return self.feature_names
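# Worked m-probability estimate (standalone), matching _train above:
# estimate = (category_sum + prior * m) / (category_count + m). Values are
# illustrative.
prior, m = 0.30, 1.0                  # global target mean and the "m" parameter
cat_sum, cat_count = 8, 20            # target sum and row count for one category
print((cat_sum + prior * m) / (cat_count + m))    # 0.3952... vs the raw mean 0.40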
class BaseNEncoder(BaseEstimator, TransformerMixin): """Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual categories is equivalent to vanilla ordinal encoding. Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded drop_invariant: bool boolean for whether or not to drop columns with 0 variance return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array) Example ------- >>>from category_encoders import * >>>import pandas as pd >>>from sklearn.datasets import load_boston >>>bunch = load_boston() >>>y = bunch.target >>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>>enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>>numeric_dataset = enc.transform(X) >>>print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 16 columns): CHAS_0 506 non-null int64 RAD_0 506 non-null int64 RAD_1 506 non-null int64 RAD_2 506 non-null int64 RAD_3 506 non-null int64 CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 LSTAT 506 non-null float64 dtypes: float64(11), int64(5) memory usage: 63.3 KB None """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.ordinal_encoder = None self._dim = None self.base = base self._encoded_columns = None def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) # do a transform on the training data to get a column list X_t = self.transform(X, override_return_df=True) self._encoded_columns = X_t.columns.values # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # first check the type X = convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, )) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = self.basen_encode(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) X.fillna(0.0, inplace=True) if self.return_df or override_return_df: return X else: return X.values def basen_encode(self, X_in, cols=None): """ """ X = X_in.copy(deep=True) if cols is None: cols = X.columns.values pass_thru = [] else: pass_thru = [col for col in X.columns.values if col not in cols] bin_cols = [] for col in cols: # figure out how many digits we need to represent the classes present if self.base == 1: digits = len(X[col].unique()) else: digits = int(np.ceil(math.log(len(X[col].unique()), self.base))) # map the ordinal column into a list of these digits, of length digits X[col] = X[col].map(lambda x: self.col_transform(x, digits)) for dig in range(digits): X[str(col) + '_%d' % (dig, )] = X[col].map(lambda r: int(r[dig]) if r is not None else None) bin_cols.append(str(col) + '_%d' % (dig, )) if self._encoded_columns is None: X = X.reindex(columns=bin_cols + pass_thru) else: X = X.reindex(columns=self._encoded_columns) return X def col_transform(self, col, digits): """ The lambda body to transform the column values """ if col is None or float(col) < 0.0: return None else: col = self.numberToBase(int(col), self.base, digits) if len(col) == digits: return col else: return [0 for _ in range(digits - len(col))] + col @staticmethod def numberToBase(n, b, limit): if b == 1: return [0 if n != _ else 1 for _ in range(limit)] if n == 0: return [0 for _ in range(limit)] digits = [] for _ in range(limit): digits.append(int(n % b)) n, _ = divmod(n, b) return digits[::-1]
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') col = switch.get('col') column_mapping = self.fit_sum_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Onehot (or dummy) coding for categorical features, produces one feature per category, each binary.

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
    impute_missing: bool
        boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
    handle_unknown: str
        options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1.
        Warning: if impute is used, an extra column will be added in if the transform matrix has unknown categories.
        This can cause unexpected changes in dimension in some cases.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = OneHotEncoder(cols=['CHAS', 'RAD']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 22 columns):
    CHAS_0     506 non-null int64
    CHAS_1     506 non-null int64
    RAD_0      506 non-null int64
    RAD_1      506 non-null int64
    RAD_2      506 non-null int64
    RAD_3      506 non-null int64
    RAD_4      506 non-null int64
    RAD_5      506 non-null int64
    RAD_6      506 non-null int64
    RAD_7      506 non-null int64
    RAD_8      506 non-null int64
    CRIM       506 non-null float64
    ZN         506 non-null float64
    INDUS      506 non-null float64
    NOX        506 non-null float64
    RM         506 non-null float64
    AGE        506 non-null float64
    DIS        506 non-null float64
    TAX        506 non-null float64
    PTRATIO    506 non-null float64
    B          506 non-null float64
    LSTAT      506 non-null float64
    dtypes: float64(11), int64(11)
    memory usage: 87.0 KB
    None

    References
    ----------

    .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
    http://www.ats.ucla.edu/stat/r/library/contrast_coding.

    .. [2] Gregory Carey (2003). Coding Categorical Variables, from
    http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf

    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
                 impute_missing=True, handle_unknown="impute"):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None
        self.impute_missing = impute_missing
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.
""" # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self def transform(self, X): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self._dim is None: raise ValueError("Must train encoder before it can be used to transform data.") # first check the type X = convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError("Unexpected input dimension %d, expected %d" % (X.shape[1], self._dim)) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = self.get_dummies(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df: return X else: return X.values @staticmethod def get_dummies(X_in, cols=None): """ """ X = X_in.copy(deep=True) if cols is None: cols = X.columns.values pass_thru = [] else: pass_thru = [col for col in X.columns.values if col not in cols] bin_cols = [] for col in cols: classes = [x for x in set(X[col].values.tolist()) if np.isfinite(x)] for class_ in classes: n_col_name = str(col) + "_%s" % (class_,) X[n_col_name] = X[col] == class_ bin_cols.append(n_col_name) X = X.reindex(columns=bin_cols + pass_thru) # convert all of the bools into integers. for col in bin_cols: X[col] = X[col].astype(int) return X
class JamesSteinEncoder(BaseEstimator, TransformerMixin): """James-Stein estimator. For feature value i, the James-Stein estimator returns a weighted average of: 1) The mean target value for the observed feature value i. 2) The mean target value (regardless of the feature value). This can be written as: JS_i = (1-B)*mean(y_i) + B*mean(y) The question is, what should the weight B be? If we put too much weight on the conditional mean value, we will overfit. If we put too much weight on the global mean, we will underfit. The canonical solution in machine learning is to perform cross-validation. However, Charles Stein came up with a closed-form solution to the problem. The intuition is: if the estimate of mean(y_i) is unreliable (y_i has high variance), we should put more weight on mean(y). Stein put it into an equation as: B = var(y_i) / (var(y_i)+var(y)) The only remaining issue is that we do not know var(y), let alone var(y_i). Hence, we have to estimate the variances. But how can we reliably estimate the variances, when we already struggle with the estimation of the mean values? There are multiple solutions: 1) If we have the same count of observations for each feature value i and all y_i are close to each other, we can pretend that all var(y_i) are identical. This is called a pooled model. 2) If the observation counts are not equal, it makes sense to replace the variances with squared standard errors, which penalize small observation counts: SE^2 = var(y)/count(y) This is called an independent model. The James-Stein estimator has, however, one practical limitation: it was defined only for normal distributions. If you want to apply it to binary classification, which allows only values {0, 1}, it is better to first convert the mean target value from the bounded interval <0,1> into an unbounded interval by replacing mean(y) with the log-odds ratio: log-odds_ratio_i = log(mean(y_i)/mean(y_not_i)) This is called the binary model. The estimation of the parameters of this model is, however, tricky and sometimes it fails fatally. In these situations, it is better to use the beta model, which generally delivers slightly worse accuracy than the binary model but does not suffer from fatal failures. Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded. drop_invariant: bool boolean for whether or not to drop encoded columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_missing: str options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability. handle_unknown: str options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability. model: str options are 'pooled', 'beta', 'binary' and 'independent', defaults to 'independent'. randomized: bool, adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). sigma: float standard deviation (spread or "width") of the normal distribution. 
Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import load_boston >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>> enc = JamesSteinEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 13 columns): CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 CHAS 506 non-null float64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 RAD 506 non-null float64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 LSTAT 506 non-null float64 dtypes: float64(13) memory usage: 51.5 KB None References ---------- .. [1] Parametric empirical Bayes inference: Theory and applications, equations 1.19 & 1.20, from https://www.jstor.org/stable/2287098 .. [2] Empirical Bayes for multiple sample sizes, from http://chris-said.io/2017/05/03/empirical-bayes-for-multiple-sample-sizes/ .. [3] Shrinkage Estimation of Log-odds Ratios for Comparing Mobility Tables, from https://journals.sagepub.com/doi/abs/10.1177/0081175015570097 .. [4] Stein's paradox and group rationality, from www.philos.rug.nl/~romeyn/presentation/2017_romeijn_-_Paris_Stein.pdf .. [5] Stein's Paradox in Statistics, from http://statweb.stanford.edu/~ckirby/brad/other/Article1977.pdf """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', model='independent', random_state=None, randomized=False, sigma=0.05): self.verbose = verbose self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.cols = cols self.ordinal_encoder = None self._dim = None self.mapping = None self.handle_unknown = handle_unknown self.handle_missing = handle_missing self.random_state = random_state self.randomized = randomized self.sigma = sigma self.model = model self.feature_names = None # noinspection PyUnusedLocal def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. 
""" # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training if self.model == 'independent': self.mapping = self._train_independent(X_ordinal, y) elif self.model == 'pooled': self.mapping = self._train_pooled(X_ordinal, y) elif self.model == 'beta': self.mapping = self._train_beta(X_ordinal, y) elif self.model == 'binary': # The label must be binary with values {0,1} unique = y.unique() if len(unique) != 2: raise ValueError("The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).") if y.isnull().any(): raise ValueError("The target column y must not contain missing values.") if np.max(unique) < 1: raise ValueError("The target column y must be binary with values {0, 1}. Value 1 was not found in the target.") if np.min(unique) > 0: raise ValueError("The target column y must be binary with values {0, 1}. Value 0 was not found in the target.") # Perform the training self.mapping = self._train_log_odds_ratio(X_ordinal, y) else: raise ValueError("model='" + str(self.model) + "' is not a recognized option") X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. When the data are used for model training, it is important to also pass the target in order to apply leave one out. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] when transform by leave one out None, when transform without target information (such as transform test set) Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # Unite the input into pandas types X = util.convert_input(X) # Then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) # If we are encoding the training data, we have to check the target if y is not None: y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") if not self.cols: return X # Do not modify the input argument X = X.copy(deep=True) X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): raise ValueError('Unexpected categories found in dataframe') # Loop over columns and replace nominal values with WOE X = self._score(X, y) # Postprocessing # Note: We should not even convert these columns. if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values def fit_transform(self, X, y=None, **fit_params): """ Encoders that utilize the target must make sure that the training data are transformed with: transform(X, y) and not with: transform(X) """ return self.fit(X, y, **fit_params).transform(X, y) def _train_pooled(self, X, y): # Implemented based on reference [1] # Initialize the output mapping = {} # Calculate global statistics prior = y.mean() target_var = y.var() global_count = len(y) for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['mean', 'count']) # See: Computer Age Statistical Inference: Algorithms, Evidence, and Data Science (Bradley Efron & Trevor Hastie, 2016) # Equations 7.19 and 7.20. # Note: The equations assume normal distribution of the label. But our label is p(y|x), # which is definitely not normally distributed as probabilities are bound to lie on interval 0..1. # We make this approximation because Efron does it as well. # Equation 7.19 # Explanation of the equation: # https://stats.stackexchange.com/questions/191444/variance-in-estimating-p-for-a-binomial-distribution # if stats['count'].var() > 0: # warnings.warn('The pooled model assumes that each category is observed exactly N times. This was violated in "' + str(col) +'" column. Consider comparing the accuracy of this model to "independent" model.') # This is a parametric estimate of var(p) in the binomial distribution. # We do not use it because we also want to support non-binary targets. # The difference in the estimates is small. # variance = prior * (1 - prior) / stats['count'].mean() # This is a squared estimate of standard error of the mean: # https://en.wikipedia.org/wiki/Standard_error variance = target_var/(stats['count'].mean()) # Equation 7.20 SSE = ((stats['mean']-prior)**2).sum() # Sum of Squared Errors if SSE > 0: # We have to avoid division by zero B = ((len(stats['count'])-3)*variance) / SSE B = B.clip(0,1) estimate = prior + (1 - B) * (stats['mean'] - prior) else: estimate = stats['mean'] # Ignore unique values. 
This helps to prevent overfitting on id-like columns # This works better than: estimate[stats['count'] == 1] = prior if len(stats['mean'])==global_count: estimate[:] = prior if self.handle_unknown == 'return_nan': estimate.loc[-1] = np.nan elif self.handle_unknown == 'value': estimate.loc[-1] = prior if self.handle_missing == 'return_nan': estimate.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': estimate.loc[-2] = prior # Store the estimate for transform() function mapping[col] = estimate return mapping def _train_independent(self, X, y): # Implemented based on reference [2] # Initialize the output mapping = {} # Calculate global statistics prior = y.mean() global_count = len(y) global_var = y.var() for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['mean', 'var']) i_var = stats['var'].fillna(0) # When we do not have more than 1 sample, assume 0 variance unique_cnt = len(X[col].unique()) # See: Parametric Empirical Bayes Inference: Theory and Applications (Morris, 1983) # Equations 1.19 and 1.20. # Note: The equations assume normal distribution of the label. But our label is p(y|x), # which is definitely not normally distributed as probabilities are bound to lie on interval 0..1. # Nevertheless, it seems to perform surprisingly well. This is in agreement with: # Data Analysis with Stein's Estimator and Its Generalizations (Efron & Morris, 1975) # The equations are similar to James-Stein estimator, as listed in: # Stein's Paradox in Statistics (Efron & Morris, 1977) # Or: # Computer Age Statistical Inference: Algorithms, Evidence, and Data Science (Efron & Hastie, 2016) # Equations 7.19 and 7.20. # The difference is that they have equal count of observations per estimated variable, while we generally # do not have that. Nice discussion about that is given at: # http://chris-said.io/2017/05/03/empirical-bayes-for-multiple-sample-sizes/ smoothing = i_var / (global_var + i_var) * (unique_cnt-3) / (unique_cnt-1) smoothing = 1 - smoothing smoothing = smoothing.clip(lower=0, upper=1) # Smoothing should be in the interval <0,1> estimate = smoothing*(stats['mean']) + (1-smoothing)*prior # Ignore unique values. This helps to prevent overfitting on id-like columns if len(stats['mean'])==global_count: estimate[:] = prior if self.handle_unknown == 'return_nan': estimate.loc[-1] = np.nan elif self.handle_unknown == 'value': estimate.loc[-1] = prior if self.handle_missing == 'return_nan': estimate.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': estimate.loc[-2] = prior # Store the estimate for transform() function mapping[col] = estimate return mapping def _train_log_odds_ratio(self, X, y): # Implemented based on reference [3] # Initialize the output mapping = {} # Calculate global statistics global_sum = y.sum() global_count = y.count() # Iterative estimation of mu and sigma as given on page 9. # This problem is traditionally solved with Newton-Raphson method: # https://en.wikipedia.org/wiki/Newton%27s_method # But we just use the scipy minimizer. def get_best_sigma(sigma, mu_k, sigma_k, K): global mu # Ugly. But I want to be able to read it once the optimization ends. w_k = 1. 
/ (sigma ** 2 + sigma_k ** 2) # Weights depend on sigma mu = sum(w_k * mu_k) / sum(w_k) # Mu transitively depends on sigma total = sum(w_k * (mu_k - mu) ** 2) # We want this to be close to K-1 loss = abs(total - (K - 1)) return loss for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['sum', 'count']) # Count of x_{i,+} and x_i # Create 2x2 contingency table crosstable = pd.DataFrame() crosstable['E-A-'] = global_count - stats['count'] + stats['sum'] - global_sum crosstable['E-A+'] = stats['count'] - stats['sum'] crosstable['E+A-'] = global_sum - stats['sum'] crosstable['E+A+'] = stats['sum'] index = crosstable.index.values crosstable = np.array(crosstable, dtype=np.float32) # The dtype argument casts the counts to float # Number of contingency tables. K = len(crosstable) # Ignore id-like columns. This helps to prevent overfitting. if K == global_count: estimate = pd.Series(0, index=values) else: if K>1: # We want to avoid division by zero in y_k calculation # Estimate log-odds ratios with Yates correction as listed on page 5. mu_k = np.log((crosstable[:, 0] + 0.5) * (crosstable[:, 3] + 0.5) / ((crosstable[:, 1] + 0.5) * (crosstable[:, 2] + 0.5))) # Standard deviation estimate for 2x2 contingency table as given in equation 2. # The explanation of the equation is given in: # https://stats.stackexchange.com/questions/266098/how-do-i-calculate-the-standard-deviation-of-the-log-odds sigma_k = np.sqrt(np.sum(1. / (crosstable + 0.5), axis=1)) # Estimate the sigma and mu. Sigma is non-negative. result = scipy.optimize.minimize(get_best_sigma, x0=1e-4, args=(mu_k, sigma_k, K), bounds=[(0, np.inf)], method='TNC', tol=1e-12, options={'gtol': 1e-12, 'ftol': 1e-12, 'eps':1e-12}) sigma = result.x[0] # Empirical Bayes follows equation 7. # However, James-Stein estimator behaves perversely when K < 3. Hence, we clip the B into interval <0,1>. # Literature reference for the clipping: # Estimates of Income for Small Places: An Application of James-Stein Procedures to Census Data (Fay & Herriot, 1979), # page 270. B = (K - 3) * sigma_k ** 2 / ((K - 1) * (sigma ** 2 + sigma_k ** 2)) B = B.clip(0,1) y_k = mu + (1 - B) * (mu_k - mu) # Convert Numpy vector back into Series estimate = pd.Series(y_k, index=index) else: estimate = pd.Series(0, index=values) if self.handle_unknown == 'return_nan': estimate.loc[-1] = np.nan elif self.handle_unknown == 'value': estimate.loc[-1] = 0 if self.handle_missing == 'return_nan': estimate.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': estimate.loc[-2] = 0 # Store the estimate for transform() function mapping[col] = estimate return mapping def _train_beta(self, X, y): # Implemented based on reference [4] # Initialize the output mapping = {} # Calculate global statistics prior = y.mean() global_count = len(y) for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['mean', 'count']) # See: Stein's paradox and group rationality (Romeijn, 2017), page 14 smoothing = stats['count'] / (stats['count'] + global_count) estimate = smoothing*(stats['mean']) + (1-smoothing)*prior # Ignore unique values. 
This helps to prevent overfitting on id-like columns if len(stats['mean'])==global_count: estimate[:] = prior if self.handle_unknown == 'return_nan': estimate.loc[-1] = np.nan elif self.handle_unknown == 'value': estimate.loc[-1] = prior if self.handle_missing == 'return_nan': estimate.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': estimate.loc[-2] = prior # Store the estimate for transform() function mapping[col] = estimate return mapping def _score(self, X, y): for col in self.cols: # Score the column X[col] = X[col].map(self.mapping[col]) # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0])) return X def get_feature_names(self): """ Returns the names of all transformed / added columns. Returns: -------- feature_names: list A list with all feature names transformed or added. Note: potentially dropped features are not included! """ if not isinstance(self.feature_names, list): raise ValueError("Estimator has to be fitted to return feature names.") else: return self.feature_names
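# A hand-worked sketch of the pooled shrinkage from _train_pooled above
# (Efron & Hastie, equations 7.19/7.20), on a toy target with four categories
# of equal count; the numbers are made up purely for illustration.
import pandas as pd

y = pd.Series([1, 0, 1, 1, 0, 0, 1, 0], dtype=float)
x = pd.Series(['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'])

prior = y.mean()                              # 0.5
stats = y.groupby(x).agg(['mean', 'count'])
variance = y.var() / stats['count'].mean()    # squared standard error, eq. 7.19
SSE = ((stats['mean'] - prior) ** 2).sum()    # 0.5, eq. 7.20
B = (len(stats) - 3) * variance / SSE         # ~0.29: weight put on the prior
B = min(max(B, 0.0), 1.0)                     # clip to [0, 1], as in the code above
estimate = prior + (1 - B) * (stats['mean'] - prior)
print(estimate)                               # per-category encodings, shrunk toward 0.5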
class SumEncoder(BaseEstimator, TransformerMixin): """ """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True): """ :param verbose: (optional, default=0) integer indicating verbosity of output. 0 for none. :param cols: (optional, default=None) a list of columns to encode, if None, all string columns will be encoded :param drop_invariant: (optional, default=False) boolean for whether or not to drop columns with 0 variance :param return_df: (optional, default=True) boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array) :return: """ self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.ordinal_encoder = None def fit(self, X, y=None, **kwargs): """ :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self def transform(self, X): """ :param X: :return: """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = sum_coding(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df: return X else: return X.values
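# For reference, the sum (deviation) contrast matrix this encoder relies on;
# the sum_coding helper itself is defined elsewhere in this module, so the
# sketch below builds the equivalent matrix with patsy instead.
import pandas as pd
from patsy import dmatrix

df = pd.DataFrame({'level': ['a', 'b', 'c']})
print(dmatrix("C(level, Sum)", df, return_type='dataframe'))
#    Intercept  C(level, Sum)[S.a]  C(level, Sum)[S.b]
# 0        1.0                 1.0                 0.0
# 1        1.0                 0.0                 1.0
# 2        1.0                -1.0                -1.0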
class WOEEncoder(BaseEstimator, TransformerMixin): """Weight of Evidence coding for categorical features. Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded. drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str options are 'return_nan', 'error' and 'value', defaults to 'value', which will assume WOE=0. randomized: bool, adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). sigma: float standard deviation (spread or "width") of the normal distribution. regularization: float the purpose of regularization is mostly to prevent division by zero. When regularization is 0, you may encounter division by zero. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import load_boston >>> bunch = load_boston() >>> y = bunch.target > 22.5 >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>> enc = WOEEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 13 columns): CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 CHAS 506 non-null float64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 RAD 506 non-null float64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 LSTAT 506 non-null float64 dtypes: float64(13) memory usage: 51.5 KB None References ---------- .. [1] Weight of Evidence (WOE) and Information Value Explained. from https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html. """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0): self.verbose = verbose self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.cols = cols self.ordinal_encoder = None self._dim = None self.mapping = None self.handle_unknown = handle_unknown self.handle_missing = handle_missing self._sum = None self._count = None self.random_state = random_state self.randomized = randomized self.sigma = sigma self.regularization = regularization self.feature_names = None # noinspection PyUnusedLocal def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") # The label must be binary with values {0,1} unique = y.unique() if len(unique) != 2: raise ValueError("The target column y must be binary. 
But the target contains " + str(len(unique)) + " unique value(s).") if y.isnull().any(): raise ValueError("The target column y must not contain missing values.") if np.max(unique) < 1: raise ValueError("The target column y must be binary with values {0, 1}. Value 1 was not found in the target.") if np.min(unique) > 0: raise ValueError("The target column y must be binary with values {0, 1}. Value 0 was not found in the target.") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except ValueError as e: # list.remove raises ValueError, not KeyError if self.verbose > 0: print("Could not remove column from feature names. " "Not found in generated cols.\n{}".format(e)) return self def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. When the data are used for model training, the target must also be passed, so that training-only logic (such as the optional randomization noise) can be applied. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] target values when transforming training data; None when transforming data without target information (such as a test set) Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # Unite the input into pandas types X = util.convert_input(X) # Then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) # If we are encoding the training data, we have to check the target if y is not None: y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") if not self.cols: return X # Do not modify the input argument X = X.copy(deep=True) X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): raise ValueError('Unexpected categories found in dataframe') # Loop over columns and replace nominal values with WOE X = self._score(X, y) # Postprocessing # Note: We should not even convert these columns. 
if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values def fit_transform(self, X, y=None, **fit_params): """ Encoders that utilize the target must make sure that the training data are transformed with: transform(X, y) and not with: transform(X) """ return self.fit(X, y, **fit_params).transform(X, y) def _train(self, X, y): # Initialize the output mapping = {} # Calculate global statistics self._sum = y.sum() self._count = y.count() for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['sum', 'count']) # Count of x_{i,+} and x_i # Create a new column with regularized WOE. # Regularization helps to avoid division by zero. # Pre-calculate WOEs because logarithms are slow. numerator = (stats['sum'] + self.regularization) / (self._sum + 2*self.regularization) denominator = ((stats['count'] - stats['sum']) + self.regularization) / (self._count - self._sum + 2*self.regularization) woe = np.log(numerator / denominator) # Ignore unique values. This helps to prevent overfitting on id-like columns. woe[stats['count'] == 1] = 0 if self.handle_unknown == 'return_nan': woe.loc[-1] = np.nan elif self.handle_unknown == 'value': woe.loc[-1] = 0 if self.handle_missing == 'return_nan': woe.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': woe.loc[-2] = 0 # Store WOE for transform() function mapping[col] = woe return mapping def _score(self, X, y): for col in self.cols: # Score the column X[col] = X[col].map(self.mapping[col]) # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0])) return X def get_feature_names(self): """ Returns the names of all transformed / added columns. Returns ------- feature_names: list A list with all feature names transformed or added. Note: potentially dropped features are not included! """ if not isinstance(self.feature_names, list): raise ValueError("Estimator has to be fitted to return feature names.") else: return self.feature_names
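# A hand-worked instance of the regularized WOE formula from _train above:
# one category with 3 positives in 5 rows, inside a dataset with 10 positives
# in 25 rows, regularization=1.0 (all numbers are made up for illustration).
import numpy as np

reg = 1.0
cat_sum, cat_count = 3, 5        # target sum / row count for this category
glob_sum, glob_count = 10, 25    # target sum / row count overall

numerator = (cat_sum + reg) / (glob_sum + 2 * reg)                             # ~ P(category | y=1), smoothed
denominator = (cat_count - cat_sum + reg) / (glob_count - glob_sum + 2 * reg)  # ~ P(category | y=0), smoothed
print(np.log(numerator / denominator))   # ~0.64: the category leans toward the positive class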
class BaseNEncoder(BaseEstimator, TransformerMixin): """Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual categories is equivalent to vanilla ordinal encoding. Parameters ---------- verbose: int integer indicating verbosity of the output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded. drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). base: int when the downstream model copes well with nonlinearities (like decision tree), use higher base. handle_unknown: str options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, an extra column will be added in if the transform matrix has nan values. This can cause unexpected changes in dimension in some cases. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import load_boston >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>> enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 18 columns): CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 CHAS_0 506 non-null int64 CHAS_1 506 non-null int64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 RAD_0 506 non-null int64 RAD_1 506 non-null int64 RAD_2 506 non-null int64 RAD_3 506 non-null int64 RAD_4 506 non-null int64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 LSTAT 506 non-null float64 dtypes: float64(11), int64(7) memory usage: 71.2 KB None """ def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2, handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.handle_unknown = handle_unknown self.handle_missing = handle_missing self.cols = cols self.mapping = mapping self.ordinal_encoder = None self._dim = None self.base = base self._encoded_columns = None self.feature_names = None def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. 
""" # if the input dataset isn't already a dataframe, convert it to one (using default column names) X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) self.mapping = self.fit_base_n_encoding(X) # do a transform on the training data to get a column list X_temp = self.transform(X, override_return_df=True) self._encoded_columns = X_temp.columns.values self.feature_names = list(X_temp.columns) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self def fit_base_n_encoding(self, X): mappings_out = [] for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') if self.handle_missing == 'value': values = values[values > 0] if self.handle_unknown == 'indicator': values = np.append(values, -1) digits = self.calc_required_digits(values) X_unique = pd.DataFrame(index=values, columns=[str(col) + '_%d' % x for x in range(digits)], data=np.array([self.col_transform(x, digits) for x in range(1, len(values) + 1)])) if self.handle_unknown == 'return_nan': X_unique.loc[-1] = np.nan elif self.handle_unknown == 'value': X_unique.loc[-1] = 0 if self.handle_missing == 'return_nan': X_unique.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': X_unique.loc[-2] = 0 mappings_out.append({'col': col, 'mapping': X_unique}) return mappings_out def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # first check the type X = util.convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) if not self.cols: return X X_out = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X_out[self.cols].isin([-1]).any().any(): raise ValueError('Columns to be encoded can not contain new values') X_out = self.basen_encode(X_out, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X_out.drop(col, 1, inplace=True) # impute missing values only in the generated columns # generated_cols = util.get_generated_cols(X, X_out, self.cols) # X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0) if self.return_df or override_return_df: return X_out else: return X_out.values def inverse_transform(self, X_in): """ Perform the inverse transformation to encoded data. Parameters ---------- X_in : array-like, shape = [n_samples, n_features] Returns ------- p: array, the same size of X_in """ # fail fast if self._dim is None: raise ValueError('Must train encoder before it can be used to inverse_transform data') # unite the type into pandas dataframe (it makes the input size detection code easier...) and make deep copy X = util.convert_input(X_in, columns=self.feature_names, deep=True) X = self.basen_to_integer(X, self.cols, self.base) # make sure that it is the right size if X.shape[1] != self._dim: if self.drop_invariant: raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should " "be False when transforming the data" % (X.shape[1],)) else: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) if not self.cols: return X if self.return_df else X.values for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': for col in self.cols: if X[switch.get('col')].isnull().any(): warnings.warn("inverse_transform is not supported because transform impute " "the unknown category nan when encode %s" % (col,)) return X if self.return_df else X.values def calc_required_digits(self, values): # figure out how many digits we need to represent the classes present if self.base == 1: digits = len(values) + 1 else: digits = int(np.ceil(math.log(len(values), self.base))) + 1 return digits def basen_encode(self, X_in, cols=None): """ Basen encoding encodes the integers as basen code with one column per digit. Parameters ---------- X_in: DataFrame cols: list-like, default None Column names in the DataFrame to be encoded Returns ------- dummies : DataFrame """ X = X_in.copy(deep=True) cols = X.columns.values.tolist() for switch in self.mapping: col = switch.get('col') mod = switch.get('mapping') base_df = mod.reindex(X[col]) base_df.set_index(X.index, inplace=True) X = pd.concat([base_df, X], axis=1) old_column_index = cols.index(col) cols[old_column_index: old_column_index + 1] = mod.columns return X.reindex(columns=cols) def basen_to_integer(self, X, cols, base): """ Convert basen code as integers. 
Parameters ---------- X : DataFrame encoded data cols : list-like Column names in the DataFrame to be encoded base : int The base of the transform Returns ------- numerical: DataFrame """ out_cols = X.columns.values.tolist() for col in cols: col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))] insert_at = out_cols.index(col_list[0]) if base == 1: value_array = np.array([int(col0.split('_')[-1]) for col0 in col_list]) else: len0 = len(col_list) value_array = np.array([base ** (len0 - 1 - i) for i in range(len0)]) X.insert(insert_at, col, np.dot(X[col_list].values, value_array.T)) X.drop(col_list, axis=1, inplace=True) out_cols = X.columns.values.tolist() return X def col_transform(self, col, digits): """ The lambda body to transform the column values """ if col is None or float(col) < 0.0: return None else: col = self.number_to_base(int(col), self.base, digits) if len(col) == digits: return col else: return [0 for _ in range(digits - len(col))] + col @staticmethod def number_to_base(n, b, limit): if b == 1: return [0 if n != _ else 1 for _ in range(limit)] if n == 0: return [0 for _ in range(limit)] digits = [] for _ in range(limit): digits.append(int(n % b)) n, _ = divmod(n, b) return digits[::-1] def get_feature_names(self): """ Returns the names of all transformed / added columns. Returns ------- feature_names: list A list with all feature names transformed or added. Note: potentially dropped features are not included! """ if not isinstance(self.feature_names, list): raise ValueError('Must fit data first. Affected feature names are not known before.') else: return self.feature_names
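# A quick sanity check of the number_to_base helper above: base-2 digit
# vectors (most significant digit first) for the ordinal codes 1..4, using
# the BaseNEncoder class defined just above.
for n in range(1, 5):
    print(n, BaseNEncoder.number_to_base(n, b=2, limit=3))
# 1 [0, 0, 1]
# 2 [0, 1, 0]
# 3 [0, 1, 1]
# 4 [1, 0, 0]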
class HelmertEncoder(BaseEstimator, TransformerMixin): """Helmert contrast coding for encoding categorical features Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded drop_invariant: bool boolean for whether or not to drop columns with 0 variance return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array) Example ------- >>>from category_encoders import * >>>import pandas as pd >>>from sklearn.datasets import load_boston >>>bunch = load_boston() >>>y = bunch.target >>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>>enc = HelmertEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>>numeric_dataset = enc.transform(X) >>>print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 22 columns): col_CHAS_0 506 non-null float64 col_CHAS_1 506 non-null float64 col_RAD_0 506 non-null float64 col_RAD_1 506 non-null float64 col_RAD_2 506 non-null float64 col_RAD_3 506 non-null float64 col_RAD_4 506 non-null float64 col_RAD_5 506 non-null float64 col_RAD_6 506 non-null float64 col_RAD_7 506 non-null float64 col_RAD_8 506 non-null float64 col_CRIM 506 non-null float64 col_ZN 506 non-null float64 col_INDUS 506 non-null float64 col_NOX 506 non-null float64 col_RM 506 non-null float64 col_AGE 506 non-null float64 col_DIS 506 non-null float64 col_TAX 506 non-null float64 col_PTRATIO 506 non-null float64 col_B 506 non-null float64 col_LSTAT 506 non-null float64 dtypes: float64(22) memory usage: 87.0 KB None References ---------- .. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from http://www.ats.ucla.edu/stat/r/library/contrast_coding. .. [2] Gregory Carey (2003). Coding Categorical Variables, from http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.ordinal_encoder = None self._dim = None def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self def transform(self, X): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. 
""" if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # first check the type X = convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, )) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = self.helmert_coding(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df: return X else: return X.values @staticmethod def helmert_coding(X_in, cols=None): """ """ X = X_in.copy(deep=True) X.columns = ['col_' + str(x) for x in X.columns.values] cols = ['col_' + str(x) for x in cols] if cols is None: cols = X.columns.values pass_thru = [] else: pass_thru = [col for col in X.columns.values if col not in cols] bin_cols = [] for col in cols: mod = dmatrix("C(%s, Helmert)" % (col, ), X) for dig in range(len(mod[0])): X[str(col) + '_%d' % (dig, )] = mod[:, dig] bin_cols.append(str(col) + '_%d' % (dig, )) X = X.reindex(columns=bin_cols + pass_thru) return X
class BaseNEncoder(BaseEstimator, TransformerMixin): """Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual categories is equivalent to vanilla ordinal encoding. Parameters ---------- verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded drop_invariant: bool boolean for whether or not to drop columns with 0 variance return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array) impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import load_boston >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) >>> enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 16 columns): CHAS_0 506 non-null int64 RAD_0 506 non-null int64 RAD_1 506 non-null int64 RAD_2 506 non-null int64 RAD_3 506 non-null int64 CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 LSTAT 506 non-null float64 dtypes: float64(11), int64(5) memory usage: 63.3 KB None """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2, impute_missing=True, handle_unknown='impute'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.impute_missing = impute_missing self.handle_unknown = handle_unknown self.cols = cols self.ordinal_encoder = None self._dim = None self.base = base self._encoded_columns = None self.digits_per_col = {} def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. 
""" # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.ordinal_encoder = self.ordinal_encoder.fit(X) for col in self.cols: self.digits_per_col[col] = self.calc_required_digits(X, col) # do a transform on the training data to get a column list X_t = self.transform(X, override_return_df=True) self._encoded_columns = X_t.columns.values # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') # first check the type X = convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % ( X.shape[1], self._dim, )) if not self.cols: return X X = self.ordinal_encoder.transform(X) X = self.basen_encode(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) X.fillna(0.0, inplace=True) if self.return_df or override_return_df: return X else: return X.values def inverse_transform(self, Xt): """ Perform the inverse transformation to encoded data. 
Parameters ---------- Xt : array-like, shape = [n_samples, n_features] Returns ------- p: array, the same size as Xt """ warnings.warn( 'Inverse transform in basen is a currently experimental feature, please be careful' ) X = Xt.copy(deep=True) # first check the type X = convert_input(X) if self._dim is None: raise ValueError( 'Must train encoder before it can be used to inverse_transform data' ) X = self.basen_to_integer(X, self.cols, self.base) # then make sure that it is the right size if X.shape[1] != self._dim: if self.drop_invariant: raise ValueError( "Unexpected input dimension %d, the attribute drop_invariant should " "be set to False when transforming the data" % (X.shape[1], )) else: raise ValueError('Unexpected input dimension %d, expected %d' % ( X.shape[1], self._dim, )) if not self.cols: return X if self.return_df else X.values if self.impute_missing and self.handle_unknown == 'impute': for col in self.cols: if any(X[col] == -1): raise ValueError( "inverse_transform is not supported because transform imputes " "the unknown category -1 when encoding %s" % (col, )) for switch in self.ordinal_encoder.mapping: col_dict = { col_pair[1]: col_pair[0] for col_pair in switch.get('mapping') } X[switch.get('col')] = X[switch.get('col')].apply( lambda x: col_dict.get(x)) return X if self.return_df else X.values def calc_required_digits(self, X, col): # figure out how many digits we need to represent the classes present if self.base == 1: digits = len(X[col].unique()) + 1 else: digits = int(np.ceil(math.log(len(X[col].unique()), self.base))) + 1 return digits def basen_encode(self, X_in, cols=None): """ Basen encoding encodes the integers as basen code with one column per digit. Parameters ---------- X_in: DataFrame cols: list-like, default None Column names in the DataFrame to be encoded Returns ------- dummies : DataFrame """ X = X_in.copy(deep=True) if cols is None: cols = X.columns.values pass_thru = [] else: pass_thru = [col for col in X.columns.values if col not in cols] bin_cols = [] for col in cols: # get how many digits we need to represent the classes present digits = self.calc_required_digits(X, col) # map the ordinal column into a list of these digits, of length digits X[col] = X[col].map(lambda x: self.col_transform(x, digits)) for dig in range(digits): X[str(col) + '_%d' % (dig, )] = X[col].map( lambda r: int(r[dig]) if r is not None else None) bin_cols.append(str(col) + '_%d' % (dig, )) if self._encoded_columns is None: X = X.reindex(columns=bin_cols + pass_thru) else: X = X.reindex(columns=self._encoded_columns) return X def basen_to_integer(self, X, cols, base): """ Convert base-n code back to integers. 
Parameters ---------- X : DataFrame encoded data cols : list-like Column names in the DataFrame to be encoded base : int The base of the transform Returns ------- numerical: DataFrame """ out_cols = X.columns.values for col in cols: col_list = [col0 for col0 in out_cols if col0.startswith(col)] for col0 in col_list: if any(X[col0].isnull()): raise ValueError( "inverse_transform is not supported because transform imputes " "the unknown category -1 when encoding %s" % (col, )) if base == 1: value_array = np.array( [int(col0.split('_')[1]) for col0 in col_list]) else: len0 = len(col_list) value_array = np.array( [base**(len0 - 1 - i) for i in range(len0)]) X[col] = np.dot(X[col_list].values, value_array.T) out_cols = [col0 for col0 in out_cols if col0 not in col_list] X = X.reindex(columns=out_cols + cols) return X def col_transform(self, col, digits): """ The lambda body to transform the column values """ if col is None or float(col) < 0.0: return None else: col = self.number_to_base(int(col), self.base, digits) if len(col) == digits: return col else: return [0 for _ in range(digits - len(col))] + col @staticmethod def number_to_base(n, b, limit): if b == 1: return [0 if n != _ else 1 for _ in range(limit)] if n == 0: return [0 for _ in range(limit)] digits = [] for _ in range(limit): digits.append(int(n % b)) n, _ = divmod(n, b) return digits[::-1]
class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): """ """ def __init__(self, verbose=0, cols=None, drop_invariant=False): """ :param verbose: :param cols: :return: """ self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols) def fit(self, X, y=None, **kwargs): """ Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds generally invariant columns to drop consistently. :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self def transform(self, X): """ :param X: :return: """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) if self.cols == []: return X X = self.ordinal_encoder.transform(X) X = backward_difference_coding(X, cols=self.cols) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) return X
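# For reference, the backward-difference contrast this encoder applies via
# the backward_difference_coding helper (defined elsewhere in this module);
# patsy's Diff coding implements the same scheme, where each contrast
# compares a level with the level immediately before it.
import pandas as pd
from patsy import dmatrix

df = pd.DataFrame({'level': ['a', 'b', 'c']})
print(dmatrix("C(level, Diff)", df, return_type='dataframe'))
# prints an intercept column plus two difference contrasts, approximately
# [-2/3, 1/3, 1/3] and [-1/3, -1/3, 2/3] columnwise for the three levels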