def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # unite the input into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.use_default_cols: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') categories = self.fit_leave_one_out( X, y, cols=self.cols ) self.mapping = categories X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """ :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """ Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds generally invariant columns to drop consistently. :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """ Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds generally invariant columns to drop consistently. :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) if isinstance(y, pd.DataFrame): y = y.iloc[:, 0] else: y = pd.Series(y, name='target') if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) _, self.mapping = self.target_encode( X, y, mapping=None, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown, smoothing_in=self.smoothing, min_samples_leaf=self.min_samples_leaf) X_temp = self.transform(X, override_return_df=True) self.feature_names = list(X_temp.columns) if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): # first check the type X = util.convert_input(X) self._dim = X.shape[1] self.start_time = time() # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if len(self.cols) == 0: self.keep_going = True return self self.ordinal_encoder = OrdinalEncoder(dtype=np.int) # 1. use sklearn's OrdinalEncoder convert categories to int self.ordinal_encoder.fit(X[self.cols]) X_ordinal = self.ordinal_encoder.transform(X[self.cols]) n_uniques = X_ordinal.max(axis=0).astype("int") + 1 if self.normalize and type_of_target(y) != "continuous": self.normalize = False if self.normalize: y = self.scaler.fit_transform(y[:, None]).flatten() # 2. train_entity_embedding_nn self.model = self.trainer.train( self.model, EntityEmbeddingNN, X_ordinal, y, None, None, self.callback, n_uniques=n_uniques ) return self
def fit(self, X, y=None, **kwargs): """ Fit doesn't actually do anything in this case. So the same object is just returned as-is. :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) _, categories = ordinal_encoding(X, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing) self.mapping = categories # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # unite the input into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.use_default_cols: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') categories = self.fit_leave_one_out(X, y, cols=self.cols) self.mapping = categories X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """ :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) self.ordinal_encoder = OrdinalEncoder(dtype=np.int) # 1. use sklearn's OrdinalEncoder convert categories to int self.ordinal_encoder.fit(X[self.cols]) X_ordinal = self.ordinal_encoder.transform(X[self.cols]) # 2. train_entity_embedding_nn self.entity_embedding_nn = train_entity_embedding_nn( X_ordinal, y, lr=self.lr, epoch=self.epoch, nn_params={ "A": self.A, "B": self.B, "dropout1": self.dropout1, "dropout2": self.dropout2, } ) return self
def fit(self, X, y=None, **kwargs): """ Fit doesn't actually do anything in this case. So the same object is just returned as-is. :param X: :param y: :param kwargs: :return: """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) _, categories = ordinal_encoding(X, mapping=self.mapping, cols=self.cols) self.mapping = categories # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) self.mapping = self.generate_mapping() X_temp = self.transform(X, override_return_df=True) self.feature_names = list(X_temp.columns) if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = [x[1] for x in switch.get('mapping')] column_mapping = self.fit_helmert_coding(values) mappings_out.append({ 'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) _, categories = self.ordinal_encoding( X, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.mapping = categories X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds generally invariant columns to drop consistently. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = [x[1] for x in switch.get('mapping')] column_mapping = self.fit_backward_difference_coding(values) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.ordinal_encoder = self.ordinal_encoder.fit(X) for col in self.cols: self.digits_per_col[col] = self.calc_required_digits(X, col) # do a transform on the training data to get a column list X_t = self.transform(X, override_return_df=True) self._encoded_columns = X_t.columns.values # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) for col in self.cols: self.digits_per_col[col] = self.calc_required_digits(X, col) # do a transform on the training data to get a column list X_t = self.transform(X, override_return_df=True) self._encoded_columns = X_t.columns.values # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) self._check_set_create_dict_attrs() self._fit_count_encode(X, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = list(X_temp.columns) if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = convert_input(X) if isinstance(y, pd.DataFrame): y = y.iloc[:, 0] else: y = pd.Series(y, name='target') if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) _, categories = self.target_encode( X, y, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown, smoothing_in=self.smoothing, min_samples_leaf=self.min_samples_leaf) self.mapping = categories if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) if isinstance(y, pd.DataFrame): y = y.iloc[:, 0].astype(float) else: y = pd.Series(y, name='target', dtype=float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.use_default_cols: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) categories = self.fit_leave_one_out(X, y, cols=self.cols) self.mapping = categories if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) _, categories = self.ordinal_encoding( X, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.mapping = categories # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = convert_input(X) y = pd.Series(y, name='target') assert X.shape[0] == y.shape[0] self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.random_state_ = check_random_state(self.random_state) _, categories = self.leave_one_out(X, y, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.mapping = categories if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds generally invariant columns to drop consistently. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type if not isinstance(X, pd.DataFrame): if isinstance(X, list): X = pd.DataFrame(np.array(X)) elif isinstance(X, (np.generic, np.ndarray)): X = pd.DataFrame(X) else: raise ValueError('Unexpected input type: %s' % (str(type(X)))) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = convert_input(X) y = pd.Series(y, name='target') assert X.shape[0] == y.shape[0] self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.random_state_ = check_random_state(self.random_state) _, categories = self.leave_one_out( X, y, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.mapping = categories if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) _, categories = self.ordinal_encoding( X, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.mapping = categories # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type if not isinstance(X, pd.DataFrame): if isinstance(X, list): X = pd.DataFrame(np.array(X)) elif isinstance(X, (np.generic, np.ndarray)): X = pd.DataFrame(X) else: raise ValueError('Unexpected input type: %s' % (str(type(X)))) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) _, categories = self.ordinal_encoding(X, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing) self.mapping = categories # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [ x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) self._check_set_create_dict_attrs() self._fit_count_encode(X, y) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y=None, **kwargs): """ :param X: :param y: :param kwargs: :return: """ # if columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols) self.ordinal_encoder = self.ordinal_encoder.fit(X) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5] return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) X = X.drop_duplicates(subset=self.cols) if self.cols else X self.ordinal_encoder = self.ordinal_encoder.fit(X) for col in self.cols: self.digits_per_col[col] = self.calc_required_digits(X, col) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds generally invariant columns to drop consistently. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, impute_missing=self.impute_missing, handle_unknown=self.handle_unknown) self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping').get_values() column_mapping = self.fit_backward_difference_coding(values) mappings_out.append({ 'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().any(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value') self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value') self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') col = switch.get('col') column_mapping = self.fit_sum_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({ 'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value') self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training if self.model == 'independent': self.mapping = self._train_independent(X_ordinal, y) elif self.model == 'pooled': self.mapping = self._train_pooled(X_ordinal, y) elif self.model == 'beta': self.mapping = self._train_beta(X_ordinal, y) elif self.model == 'binary': # The label must be binary with values {0,1} unique = y.unique() if len(unique) != 2: raise ValueError( "The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).") if y.isnull().any(): raise ValueError( "The target column y must not contain missing values.") if np.max(unique) < 1: raise ValueError( "The target column y must be binary with values {0, 1}. Value 1 was not found in the target." ) if np.min(unique) > 0: raise ValueError( "The target column y must be binary with values {0, 1}. Value 0 was not found in the target." ) # Perform the training self.mapping = self._train_log_odds_ratio(X_ordinal, y) else: raise ValueError("model='" + str(self.model) + "' is not a recognized option") X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') col = switch.get('col') column_mapping = self.fit_sum_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Unite parameters into pandas types X = convert_input(X) if isinstance(y, pd.DataFrame): y = y.iloc[:, 0] else: y = pd.Series(y, name='target') # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") # The label must be binary with values {0,1} unique = y.unique() if len(unique) != 2: raise ValueError( "The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).") if y.isnull().any(): raise ValueError( "The target column y must not contain missing values.") if np.max(unique) < 1: raise ValueError( "The target column y must be binary with values {0, 1}. Value 1 was not found in the target." ) if np.min(unique) > 0: raise ValueError( "The target column y must be binary with values {0, 1}. Value 0 was not found in the target." ) self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = get_obj_cols(X) # Training self.mapping = self._train(X, y, cols=self.cols) # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Create parent encoder and fit it self.parent_cols = list(self.feature_mapping.values()) self.parent_encoder = MEstimateEncoder( verbose=self.verbose, cols=self.parent_cols, drop_invariant=self.drop_invariant, return_df=self.return_df, handle_unknown=self.handle_unknown, handle_missing=self.handle_missing, random_state=self.random_state, randomized=self.randomized, sigma=self.sigma, m=self.m_prior, ) self.parent_encoder.fit(X, y) # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == "error": if X[self.cols].isnull().any().any(): raise ValueError("Columns to be encoded can not contain null") # Check that children and parents are disjoint children = set(self.feature_mapping.keys()) parents = set(self.feature_mapping.values()) if len(children.intersection(parents)) > 0: raise ValueError("No column should be a child and a parent") self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, handle_unknown="value", handle_missing="value", ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self