def fit(self, X, y, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # unite the input into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.use_default_cols: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().any(): raise ValueError('Columns to be encoded can not contain null') categories = self._fit(X, y, cols=self.cols) self.mapping = categories X_temp = self.transform(X, y, override_return_df=True) self.feature_names = X_temp.columns.tolist() if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def fit(self, X, y, **kwargs): # unite the input into pandas types X = utils.convert_input(X) y = utils.convert_input(y) y.columns = ['target'] # apply one-hot-encoder on the label self.label_encoder = encoders.OneHotEncoder(handle_missing='error', handle_unknown='error', cols=['target'], drop_invariant=True, use_cat_names=True) labels = self.label_encoder.fit_transform(y) labels.columns = [column[7:] for column in labels.columns] labels = labels.iloc[:, 1:] # drop one label # train the feature encoders for class_name, label in labels.iteritems(): self.feature_encoders[class_name] = copy.deepcopy(self.feature_encoder).fit(X, label)
def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] when transform by leave one out None, when transform without target information (such as transform test set) Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self.handle_missing == 'error': if X[self.cols].isnull().any().any(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') # unite the input into pandas types X = util.convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % ( X.shape[1], self._dim, )) # if we are encoding the training data, we have to check the target if y is not None: y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") if not self.cols: return X X = self._transform(X, y, mapping=self.mapping) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values
def transform(self, X, override_return_df=False): """ Call _transform() if you want to use single CPU with all samples """ if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') # first check the type self.X = util.convert_input(X) self.data_lines = len(self.X) # then make sure that it is the right size if self.X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % (self.X.shape[1], self._dim, )) if not self.cols: return self.X data_lock = multiprocessing.Manager().Lock() new_start = multiprocessing.Manager().Value('d', True) done_index = multiprocessing.Manager().Value('d', int(0)) hashing_parts = multiprocessing.Manager().Queue() if self.auto_sample: self.max_sample = int(self.data_lines / self.max_process) if self.max_process == 1: self.require_data(self, data_lock, new_start, done_index, hashing_parts, cols=self.cols, process_index=1) else: n_process = [] for thread_index in range(self.max_process): process = multiprocessing.Process(target=self.require_data, args=(self, data_lock, new_start, done_index, hashing_parts, self.cols, thread_index + 1)) process.daemon = True n_process.append(process) for process in n_process: process.start() for process in n_process: process.join() data = self.X if self.max_sample == 0 or self.max_sample == self.data_lines: if hashing_parts: data = list(hashing_parts.get().values())[0] else: list_data = {} while not hashing_parts.empty(): list_data.update(hashing_parts.get()) sort_data = [] for part_index in sorted(list_data): sort_data.append(list_data[part_index]) if sort_data: data = pd.concat(sort_data, ignore_index=True) # Check if is_return_df if self.return_df or override_return_df: return data else: return data.values
def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self.handle_missing == 'error': if X[self.cols].isnull().any().any(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') # first check the type X = util.convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % ( X.shape[1], self._dim, )) if not self.cols: return X X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): raise ValueError( 'Columns to be encoded can not contain new values') X = self.polynomial_coding(X, self.mapping) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values
def inverse_transform(self, X_in): """ Perform the inverse transformation to encoded data. Will attempt best case reconstruction, which means it will return nan for handle_missing and handle_unknown settings that break the bijection. We issue warnings when some of those cases occur. Parameters ---------- X_in : array-like, shape = [n_samples, n_features] Returns ------- p: array, the same size of X_in """ # fail fast if self._dim is None: raise ValueError('Must train encoder before it can be used to inverse_transform data') # first check the type and make deep copy X = util.convert_input(X_in, deep=True) # then make sure that it is the right size if X.shape[1] != self._dim: if self.drop_invariant: raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should " "be False when transforming the data" % (X.shape[1],)) else: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) if not self.cols: return X if self.return_df else X.values if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): warnings.warn("inverse_transform is not supported because transform impute " "the unknown category -1 when encode %s" % (col,)) if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': for col in self.cols: if X[col].isnull().any(): warnings.warn("inverse_transform is not supported because transform impute " "the unknown category nan when encode %s" % (col,)) for switch in self.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.values) X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) return X if self.return_df else X.values
def fit_transform(self, X, y=None, **fit_params): # When we are training the feature encoders, we have to use fit_transform() method on the features. # unite the input into pandas types X = utils.convert_input(X) y = utils.convert_input(y) y.columns = ['target'] # apply one-hot-encoder on the label self.label_encoder = encoders.OneHotEncoder(handle_missing='error', handle_unknown='error', cols=['target'], drop_invariant=True, use_cat_names=True) labels = self.label_encoder.fit_transform(y) labels.columns = [column[7:] for column in labels.columns] labels = labels.iloc[:, 1:] # drop one label # initialization of the feature encoders encoded = None feature_encoder = None all_new_features = pd.DataFrame() # fit_transform the feature encoders for class_name, label in labels.iteritems(): feature_encoder = copy.deepcopy(self.feature_encoder) encoded = feature_encoder.fit_transform(X, label) # decorate the encoded features with the label class suffix new_features = encoded[feature_encoder.cols] new_features.columns = [str(column) + '_' + class_name for column in new_features.columns] all_new_features = pd.concat((all_new_features, new_features), axis=1) self.feature_encoders[class_name] = feature_encoder # add features that were not encoded result = pd.concat((encoded[encoded.columns[~encoded.columns.isin(feature_encoder.cols)]], all_new_features), axis=1) return result
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def inverse_transform(self, X_in): """ Perform the inverse transformation to encoded data. Parameters ---------- X_in : array-like, shape = [n_samples, n_features] Returns ------- p: array, the same size of X_in """ # fail fast if self._dim is None: raise ValueError('Must train encoder before it can be used to inverse_transform data') # unite the type into pandas dataframe (it makes the input size detection code easier...) and make deep copy X = util.convert_input(X_in, columns=self.feature_names, deep=True) X = self.basen_to_integer(X, self.cols, self.base) # make sure that it is the right size if X.shape[1] != self._dim: if self.drop_invariant: raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should " "be False when transforming the data" % (X.shape[1],)) else: raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) if not self.cols: return X if self.return_df else X.values for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.values) X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': for col in self.cols: if X[switch.get('col')].isnull().any(): warnings.warn("inverse_transform is not supported because transform impute " "the unknown category nan when encode %s" % (col,)) return X if self.return_df else X.values
def transform(self, X, y=None): """Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') # first check the type X = util.convert_input(X) # then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % ( X.shape[1], self._dim, )) if not self.cols: return X X, _ = self._transform_count_encode(X, y) if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df: return X else: return X.values
def fit(self, X, y=None, **kwargs): """Fit encoder according to X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) self._check_set_create_dict_attrs() self._fit_count_encode(X, y) if self.drop_invariant: self.drop_cols = [] X_temp = self.transform(X) generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] return self
def transform(self, X): # unite the input into pandas types X = utils.convert_input(X) # initialization encoded = None feature_encoder = None all_new_features = pd.DataFrame() # transform the features for class_name, feature_encoder in self.feature_encoders.items(): encoded = feature_encoder.transform(X) # decorate the encoded features with the label class suffix new_features = encoded[feature_encoder.cols] new_features.columns = [str(column) + '_' + class_name for column in new_features.columns] all_new_features = pd.concat((all_new_features, new_features), axis=1) # add features that were not encoded result = pd.concat((encoded[encoded.columns[~encoded.columns.isin(feature_encoder.cols)]], all_new_features), axis=1) return result
def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """ # if the input dataset isn't already a dataframe, convert it to one (using default column names) # first check the type X = util.convert_input(X) self._dim = X.shape[1] # if columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().any(): raise ValueError('Columns to be encoded can not contain null') # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value') self.ordinal_encoder = self.ordinal_encoder.fit(X) ordinal_mapping = self.ordinal_encoder.category_mapping mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') col = switch.get('col') column_mapping = self.fit_polynomial_coding( col, values, self.handle_missing, self.handle_unknown) mappings_out.append({ 'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # drop all output columns with 0 variance. if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self
def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. When the data are used for model training, it is important to also pass the target in order to apply leave one out. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] when transform by leave one out None, when transform without target information (such as transform test set) Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ if self.handle_missing == 'error': if X[self.cols].isnull().any().any(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise ValueError( 'Must train encoder before it can be used to transform data.') # Unite the input into pandas types X = util.convert_input(X) # Then make sure that it is the right size if X.shape[1] != self._dim: raise ValueError('Unexpected input dimension %d, expected %d' % ( X.shape[1], self._dim, )) # If we are encoding the training data, we have to check the target if y is not None: y = util.convert_input_vector(y, X.index).astype(float) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") if not self.cols: return X # Do not modify the input argument X = X.copy(deep=True) X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): raise ValueError('Unexpected categories found in dataframe') # Loop over columns and replace nominal values with WOE X = self._score(X, y) # Postprocessing # Note: We should not even convert these columns. if self.drop_invariant: for col in self.drop_cols: X.drop(col, 1, inplace=True) if self.return_df or override_return_df: return X else: return X.values
def fit(self, X, y, **kwargs): """Fit encoder according to X and binary y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Binary target values. Returns ------- self : encoder Returns self. """ # Unite parameters into pandas types X = util.convert_input(X) y = util.convert_input_vector(y, X.index).astype(float) # The lengths must be equal if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") self._dim = X.shape[1] # If columns aren't passed, just use every string column if self.cols is None: self.cols = util.get_obj_cols(X) else: self.cols = util.convert_cols_to_list(self.cols) if self.handle_missing == 'error': if X[self.cols].isnull().any().any(): raise ValueError('Columns to be encoded can not contain null') self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value') self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) # Training if self.model == 'independent': self.mapping = self._train_independent(X_ordinal, y) elif self.model == 'pooled': self.mapping = self._train_pooled(X_ordinal, y) elif self.model == 'beta': self.mapping = self._train_beta(X_ordinal, y) elif self.model == 'binary': # The label must be binary with values {0,1} unique = y.unique() if len(unique) != 2: raise ValueError( "The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).") if y.isnull().any(): raise ValueError( "The target column y must not contain missing values.") if np.max(unique) < 1: raise ValueError( "The target column y must be binary with values {0, 1}. Value 1 was not found in the target." ) if np.min(unique) > 0: raise ValueError( "The target column y must be binary with values {0, 1}. Value 0 was not found in the target." ) # Perform the training self.mapping = self._train_log_odds_ratio(X_ordinal, y) else: raise ValueError("model='" + str(self.model) + "' is not a recognized option") X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() # Store column names with approximately constant variance on the training data if self.drop_invariant: self.drop_cols = [] generated_cols = util.get_generated_cols(X, X_temp, self.cols) self.drop_cols = [ x for x in generated_cols if X_temp[x].var() <= 10e-5 ] try: [self.feature_names.remove(x) for x in self.drop_cols] except KeyError as e: if self.verbose > 0: print("Could not remove column from feature names." "Not found in generated cols.\n{}".format(e)) return self