def train_my_model(train_dataset):
    """Train a RandomForestRegressor for 'Yards' on cleaned + spatial features.

    Args:
        train_dataset: raw training data handed to the blue-print builders.

    Returns:
        tuple: (fitted RandomForestRegressor, fitted OrdinalEncoder,
        categorical column index, rusher characteristics from cleaning).
    """
    clean_data, rusher_char = cleaning_blue_print(train_dataset, save=False)
    spatial_data = spatial_blue_print(train_dataset, save=False)
    # for testing purposes
    # clean_data = pd.read_csv('datasets/train_cleaned_data_v1_1.csv')
    # spatial_data = pd.read_csv('datasets/train_spatial_data_v1_1.csv')
    # print(clean_data.dtypes['GameSnap'], spatial_data.dtypes['_GameSnap'])
    total_data = pd.merge(clean_data, spatial_data, left_on='GameSnap',
                          right_on='_GameSnap', how='left')
    # Keep only rows where the spatial join succeeded, then drop the join keys.
    dataset = total_data[total_data.QB1_offense_mean_distance.notnull()].drop(
        ['GameSnap', '_GameSnap'], axis=1)  # in the dataset this column is not populated
    train, test = train_test_split(dataset, test_size=0.3, random_state=123)
    # print(train.isnull().sum().to_string())
    x_train = train.drop(['Yards'], axis=1)
    # NOTE(review): x_test / y_test are built but never used or returned —
    # confirm whether a hold-out evaluation step was intended here.
    x_test = test.drop(['Yards'], axis=1)
    y_train = train['Yards']
    y_test = test['Yards']
    # finding the categorical variables
    cat_features = train.select_dtypes(include=['object']).columns
    # Ordinal encoding of the categorical variables; fitted on the full
    # dataset so categories only present in the test split still get codes.
    enc = OrdinalEncoder()
    enc.fit(dataset[cat_features])
    x_train[cat_features] = enc.transform(x_train[cat_features])
    # print(cat_features)
    # Random Forrest model
    RF_model = RandomForestRegressor()
    RF_model.fit(x_train, y_train)
    return (RF_model, enc, cat_features, rusher_char)
def test_ordinal_encoder_mixed_string_int_drop(self):
    """ONNX conversion of an OrdinalEncoder fitted on mixed string/int columns.

    Fits on two string columns plus one int column, converts to ONNX with the
    matching tensor-type declarations, and checks the converted model against
    sklearn on a single sample row.
    """
    data = [
        ["c0.4", "c0.2", 3],
        ["c1.4", "c1.2", 0],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
    ]
    test = [["c0.2", "c2.2", 1]]
    model = OrdinalEncoder(categories="auto")
    model.fit(data)
    # ONNX graph inputs: first two columns as strings, third as int64.
    inputs = [
        ("input1", StringTensorType([None, 2])),
        ("input2", Int64TensorType([None, 1])),
    ]
    model_onnx = convert_sklearn(model, "ordinal encoder", inputs,
                                 target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    # allow_failure: skip the runtime comparison on onnxruntime <= 0.5.0.
    dump_data_and_model(
        test,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderMixedStringIntDrop",
        allow_failure="StrictVersion("
        "onnxruntime.__version__)"
        "<= StrictVersion('0.5.0')",
    )
class MatrixOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is defined by a matrix"""

    def __init__(self, values, matrix, undefined=("n.a.",)):
        """
        Args:
            values (list): Recognized attribute values, in matrix order.
            matrix: Square similarity lookup; matrix[i][j] is the similarity
                between values[i] and values[j].
            undefined (iterable): Values which are recognized but not
                comparable; comparing them yields nan.
        """
        super().__init__()
        self.values = values
        self.matrix = matrix
        self.undefined = undefined
        # Number of comparable values; encoded codes >= n are "undefined".
        self.n = len(values)
        self.encoder = None

    def get_description(self):
        """Return a serializable description of this attribute's configuration."""
        return {"__class__": self.__class__.__module__ + "." + self.__class__.__name__,
                "values": self.values,
                "matrix": self.matrix,
                "undefined": self.undefined}

    def fit(self, X, y=None):
        """Build the ordinal encoder from the configured values; X is ignored."""
        # Categories are fixed up-front, so the fit data is irrelevant —
        # defined values get codes 0..n-1, undefined ones n and above.
        self.encoder = OrdinalEncoder([self.values + list(self.undefined)], dtype=int)
        self.encoder.fit([[x] for x in self.values + list(self.undefined)])  # Argument irrelevant
        return self

    def transform(self, X, y=None):
        """Encode X into integer codes (indices into values + undefined)."""
        return self.encoder.transform(X)

    def similarity(self, x, y):
        """Matrix similarity of two encoded codes; nan when either is undefined."""
        if x >= self.n or y >= self.n:
            return np.nan
        return self.matrix[x][y]
def load_data(path, path_on):
    """Load offline (labelled) and online (unlabelled) TSV data, fill missing
    values, and ordinal-encode every feature column.

    Args:
        path: offline training TSV; must contain 'label' and 'sid' columns.
        path_on: online TSV; must contain a 'sid' column.

    Returns:
        tuple: (encoded offline feature array, offline labels, fitted
        OrdinalEncoder, feature column index).
    """
    df = pd.read_csv(path, sep="\t")
    df = precession(df)
    # Drop features with very low importance (per a feature_importance analysis).
    # Fix: pass axis as a keyword — the positional form was deprecated and
    # removed in pandas 2.0.
    df.drop(['imeimd5', 'openudidmd5', 'os', 'adidmd5', 'idfamd5'],
            axis=1, inplace=True)
    # TODO(optimize): missing values are filled with "empty" to match the raw
    # data's missing-value marker; the filler itself becomes a category.
    df = df.fillna("empty")
    # Offline training data.
    x = df.drop(['label', 'sid'], axis=1)
    y = df['label']
    cols = x.columns
    # Online data.
    df_on = pd.read_csv(path_on, sep="\t")
    df_on = precession(df_on)
    df_on.drop(['imeimd5', 'openudidmd5', 'os', 'adidmd5', 'idfamd5'],
               axis=1, inplace=True)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # Fit on offline + online data together so categories that only appear
    # online still receive a code.
    x_all = pd.concat([x, x_on], axis=0)
    # Encode every (string) column to integers.
    oe = OrdinalEncoder()
    oe.fit(x_all)  # object columns are encoded automatically
    x = oe.transform(x)
    print(x.shape)
    return x, y, oe, cols
class LinearOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is linear with respect to a scale"""

    def __init__(self, order, undefined=("n.a.",)):
        """
        Args:
            order (list): List of values, defining their ordering.
            undefined (iterable): Values which are recognized, but not
                comparable to the ranking. When such a value is found, the
                similarity returned is nan.
        """
        super().__init__()
        self.order = order
        self.undefined = undefined
        # Number of ranked values; encoded codes >= n are "undefined".
        self.n = len(order)
        self.encoder = None

    def get_description(self):
        """Return a serializable description of this attribute's configuration."""
        return {"__class__": self.__class__.__module__ + "." + self.__class__.__name__,
                "order": self.order,
                "undefined": self.undefined}

    def fit(self, X, y=None):
        """Build the ordinal encoder from the configured ordering; X is ignored."""
        # Categories are fixed up-front: ranked values get codes 0..n-1
        # (matching their position in `order`), undefined ones n and above.
        self.encoder = OrdinalEncoder([self.order + list(self.undefined)])
        self.encoder.fit([[x] for x in self.order + list(self.undefined)])  # Argument irrelevant
        return self

    def transform(self, X, y=None):
        """Encode X into integer rank codes."""
        return self.encoder.transform(X)

    def similarity(self, x, y):
        """Linear similarity of two rank codes: 1 for equal ranks, 0 for the
        two extremes; nan when either code is undefined."""
        if x >= self.n or y >= self.n:
            return np.nan
        return 1 - abs(x - y) / (self.n - 1)
def encode_categories(final_db):
    """Select the modelling features and ordinal-encode the string columns.

    Inputs:
    - final_db (Pandas dataframe): cleaned dataframe

    Outputs:
    - X (Pandas dataframe): NxD feature matrix (N datapoints, D features)
    - y (numpy array): Nx1 label array (binary or multiclass)
    - enc (sklearn OrdinalEncoder): fitted encoder, kept for later decoding
    """
    feature_cols = [
        'exposure_type', 'obs_duration_mean', 'conc1_type', 'species',
        'class', 'tax_order', 'family', 'genus', 'atom_number',
        'alone_atom_number', 'tripleBond', 'doubleBond', 'bonds_number',
        'ring_number', 'Mol', 'MorganDensity', 'LogP'
    ]
    # Subset of feature_cols holding string categories that need encoding.
    string_cols = [
        'exposure_type', 'conc1_type', 'species', 'class', 'tax_order',
        'family', 'genus'
    ]

    # Work on a copy so final_db is left untouched.
    X = final_db.copy()[feature_cols]
    y = final_db.score.copy().values

    # Encoding phase: replace string categories with integer codes.
    enc = OrdinalEncoder(dtype=int)
    enc.fit(X[string_cols])
    X[string_cols] = enc.transform(X[string_cols])

    return X, y, enc
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    """Benchmark CustomOrdinalFeatureEncoder against sklearn's OrdinalEncoder.

    Runs fit + transform + inverse_transform `iterations` times on each
    encoder and reports the mean wall-clock time.

    Args:
        X: optional 2-D array of categorical data; a small default is used
            when omitted.
        iterations: number of timing repetitions to average over.
        verbose: print the mean timings when truthy.

    Returns:
        tuple: (mean custom-encoder time, mean sklearn-encoder time) in seconds.
    """
    # Bug fix: `if not X:` raises ValueError for a non-empty ndarray
    # (ambiguous truth value); test for the None sentinel explicitly.
    if X is None:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])
    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()
    ordinal_encoder_time = []
    custom_encoder_time = []
    for i in range(iterations):
        # Time the full round trip of the custom encoder.
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        # Same round trip with sklearn's encoder.
        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)
    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
def test_OrdinalEncoder():
    """feat() must report every nominal column paired with itself."""
    encoder = OrdinalEncoder().fit(X[nominal])
    expected = pd.DataFrame({"name": nominal, "feature": nominal})
    assert feat(encoder, nominal).equals(expected)
class NewOrdinalEncoder(OrdinalEncoder):
    """
    comparable with null value & numerical input

    Wraps an inner OrdinalEncoder restricted to `category_cols` and shifts
    the resulting integer codes so they start at `begin_idx`.
    """
    def __init__(self, category_cols: List[str], begin_idx=0) -> None:
        # Bug fix: the original called super(OrdinalEncoder, self).__init__(),
        # which bypassed OrdinalEncoder.__init__ entirely and left this
        # instance without its base-estimator parameters. A plain super()
        # call runs OrdinalEncoder's own initializer with its defaults.
        super().__init__()
        self.ordinal_encoder = OrdinalEncoder(
            # handle_unknown='use_encoded_value', unknown_value='null'
        )
        # Columns to ordinal-encode; all other columns pass through untouched.
        self.category_cols = category_cols
        # self.null_map = {col: 'null' for col in self.category_cols}
        # Offset added to every encoded value.
        self.begin_idx = begin_idx

    def fit(self, X, y=None):
        """Fit the inner encoder on the configured categorical columns."""
        # X.fillna(self.null_map, inplace=True)
        # X[self.category_cols] = X[self.category_cols].astype('str')
        self.ordinal_encoder.fit(X[self.category_cols])
        return self

    def transform(self, X):
        """Encode the categorical columns in place (mutates X) and return X."""
        # X[self.category_cols] = X[self.category_cols].astype('str')
        # X.fillna(self.null_map, inplace=True)
        X.loc[:, self.category_cols] = self.ordinal_encoder.transform(
            X[self.category_cols]).astype('int') + self.begin_idx
        return X
def load_data(path, path_on):
    """Load offline (labelled) and online (unlabelled) CSV data, fill missing
    values, and ordinal-encode every feature column.

    Args:
        path: offline training CSV; must contain 'label' and 'sid' columns.
        path_on: online CSV; must contain a 'sid' column.

    Returns:
        tuple: (encoded offline feature array, offline labels, fitted
        OrdinalEncoder, feature column index).
    """
    df = pd.read_csv(path)
    df = precession(df)
    # TODO(optimize): missing values are filled with "empty" to match the raw
    # data's missing-value marker; the filler itself becomes a category.
    df = df.fillna("empty")
    # Offline training data.
    x = df.drop(['label', 'sid'], axis=1)
    y = df['label']
    cols = x.columns
    # Online data.
    df_on = pd.read_csv(path_on)
    df_on = precession(df_on)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # Fit on offline + online data together so categories that only appear
    # online still receive a code. Fix: pass axis as a keyword — the
    # positional form of pd.concat's axis argument is deprecated.
    x_all = pd.concat([x, x_on], axis=0)
    print(x_all.shape)
    # Encode every (string) column to integers.
    oe = OrdinalEncoder()
    oe.fit(x_all)
    x = oe.transform(x)
    print(x.shape)
    return x, y, oe, cols
def set_miss_values(df, complete_index):
    """Impute missing values of one column using a RandomForestRegressor.

    Args:
        df: frame to impute; modified in place and also returned.
        complete_index: column names used for the imputation model; the FIRST
            entry is the column whose NaNs get filled, the rest are features.

    Returns:
        df with NaNs in df[complete_index[0]] replaced by model predictions.
    """
    enc_label = OrdinalEncoder()  # encodes the target column
    enc_fea = OrdinalEncoder()    # encodes the feature columns
    missing_index = complete_index[0]
    # Take out the existing numerical data (no NaN) and throw them in Random Forest Regressor
    train_df = df[complete_index]
    # known & unknow values
    known_values = np.array(train_df[train_df[missing_index].notnull()])
    unknow_values = np.array(train_df[train_df[missing_index].isnull()])
    # y is the know missing_index (column 0, since missing_index comes first)
    y = known_values[:, 0].reshape(-1, 1)
    enc_label.fit(y)
    y = enc_label.transform(y)
    # X are the features
    X = known_values[:, 1:]
    test_X = unknow_values[:, 1:]
    # Fit the feature encoder on known + unknown rows so categories that only
    # occur in the to-be-imputed rows still get a code.
    all_X = np.row_stack((X, test_X))
    enc_fea.fit(all_X)
    X = enc_fea.transform(X)
    # fit
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y.ravel())
    # predict
    # NOTE(review): the regressor yields continuous values which are fed to
    # OrdinalEncoder.inverse_transform; that implies truncation to integer
    # codes rather than rounding — confirm this is intended.
    predicted_values = rfr.predict(enc_fea.transform(unknow_values[:, 1:]))
    predicted_values = enc_label.inverse_transform(predicted_values.reshape(-1, 1))
    # fill in with predicted values
    df.loc[(df[missing_index].isnull()), missing_index] = predicted_values
    return df
def load_data_knn(DATA_PATH, encoding, seed=42):
    """Load and preprocess the ecotoxicology dataset for a KNN model.

    Args:
        DATA_PATH: CSV path; must contain the numerical/categorical columns
            below plus 'pubchem2d', 'fish' and the target 'conc1_mean'.
        encoding: 'binary' or 'multiclass' — how to encode the target.
        seed: random_state for the train/test split.

    Returns:
        tuple: (X_try: train rows followed by test rows, X_train, X_test,
        y_train, y_test, len_X_train: number of training rows).
    """
    db = pd.read_csv(DATA_PATH).drop(
        columns=['Unnamed: 0', 'test_cas', 'smiles'])
    numerical = [
        'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP',
        'alone_atom_number', 'doubleBond', 'tripleBond', 'ring_number',
        'oh_count', 'MeltingPoint', 'WaterSolubility'
    ]
    # Categorical columns + obs_duration_mean (already numeric).
    categorical = [
        'conc1_type', 'exposure_type', 'control_type', 'media_type',
        'application_freq_unit', 'species', 'class', 'tax_order', 'family',
        'genus'
    ]
    # MinMax transform for numerical variables.
    for nc in numerical:
        minmax = MinMaxScaler()
        minmax.fit(db[[nc]])
        db[[nc]] = minmax.transform(db[[nc]])
    # Ordinal encoding for categorical variables, shifted so codes start at 1.
    encoder = OrdinalEncoder(dtype=int)
    encoder.fit(db[categorical])
    db[categorical] = encoder.transform(db[categorical]) + 1
    # Expand the pubchem2d fingerprint string into 881 separate columns.
    db = pd.concat([db, pd.DataFrame(pd.DataFrame(db['pubchem2d'].values).\
        apply(lambda x: x.str.replace('', ' ').str.strip().str.split(' '),
              axis=1)[0].to_list(),
        columns=['pub' + str(i) for i in range(1, 882)])], axis=1)
    db.drop(columns=['fish'], inplace=True)
    # Encoding for target variable: binary and multiclass.
    if encoding == 'binary':
        db['conc1_mean'] = np.where(db['conc1_mean'].values > 1, 0, 1)
    elif encoding == 'multiclass':
        t = db['conc1_mean'].copy()
        db['conc1_mean'] = multiclass_encoding(t)
    X = db.drop(columns='conc1_mean')
    y = db['conc1_mean'].values
    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=seed)
    # Re-join train with test. Bug fix: DataFrame.append was deprecated in
    # pandas 1.4 and removed in 2.0 — use pd.concat instead (same result:
    # train rows followed by test rows, indexes preserved).
    X_try = pd.concat([X_train, X_test])
    # Keep track of the training-set length.
    len_X_train = len(X_train)
    return X_try, X_train, X_test, y_train, y_test, len_X_train
def loadUnswNb15(folder, shuffleDataset=False, randomState=None):
    """Load the UNSW-NB15 train/test CSVs and ordinal-encode the
    'proto', 'service' and 'state' columns.

    Args:
        folder: directory containing UNSW_NB15_training-set.csv and
            UNSW_NB15_testing-set.csv.
        shuffleDataset: shuffle each split (independently) when True.
        randomState: seed forwarded to shuffle().

    Returns:
        tuple: (trainingX, testingX, trainingY, testingY) as numpy arrays;
        the Y arrays hold the 'attack_cat' labels.
    """
    xEncoder = OrdinalEncoder()
    folder = Path(folder)
    trainingSetPath = folder / 'UNSW_NB15_training-set.csv'
    testingSetPath = folder / 'UNSW_NB15_testing-set.csv'
    trainingSet = pd.read_csv(str(trainingSetPath))
    testingSet = pd.read_csv(str(testingSetPath))
    trainingY = trainingSet['attack_cat'].values
    trainingX = trainingSet.drop(columns=['id', 'attack_cat', 'label'])
    testingY = testingSet['attack_cat'].values
    testingX = testingSet.drop(columns=['id', 'attack_cat', 'label'])
    # Fit the encoder on train + test so categories unique to the test set
    # still receive a code.
    xEncoder.fit(
        pd.concat([
            trainingX[['proto', 'service', 'state']],
            testingX[['proto', 'service', 'state']]
        ], ignore_index=True))
    trainingX[['proto', 'service', 'state']] = \
        xEncoder.transform(trainingX[['proto', 'service', 'state']])
    trainingX = trainingX.values
    testingX[['proto', 'service', 'state']] = \
        xEncoder.transform(testingX[['proto', 'service', 'state']])
    testingX = testingX.values
    if shuffleDataset:
        trainingX, trainingY = shuffle(trainingX, trainingY,
                                       random_state=randomState)
        testingX, testingY = shuffle(testingX, testingY,
                                     random_state=randomState)
    return trainingX, testingX, trainingY, testingY
def _encode_feature(self, mat, feature_column):
    """Fit an OrdinalEncoder on one column of `mat`, remember it in
    self.feature_encoders, and write the encoded values back in place.

    Returns the (mutated) frame for chaining."""
    column_values = mat[feature_column].to_numpy().reshape(-1, 1)
    encoder = OrdinalEncoder()
    encoder.fit(column_values)
    self.feature_encoders[feature_column] = encoder
    mat.loc[:, feature_column] = encoder.transform(column_values)
    return mat
def test_ordinal_encoder_handle_unknowns_raise(params, err_type, err_msg):
    """Invalid handle_unknown parameter combinations must raise at fit time."""
    data = np.array([['a', 'x'], ['b', 'y']], dtype=object)
    encoder = OrdinalEncoder(**params)
    with pytest.raises(err_type, match=err_msg):
        encoder.fit(data)
def dataWash(city, path: str):
    """Clean one city's slice of the weather CSV for modelling.

    Args:
        city: value of the 'Location' column to keep.
        path: path to the weather CSV.

    Returns:
        tuple: (X feature frame, Y 'RainTomorrow' frame), both fully
        numeric with missing values imputed.
    """
    weather = pd.read_csv(path)
    X = weather.iloc[:, :-1]
    Y = weather.loc[:, ("Location", "RainTomorrow")]
    X = X.loc[X.loc[:, "Location"] == city]
    Y = Y.loc[Y.loc[:, "Location"] == city]
    Y = Y.drop(['Location'], axis=1)
    X = X.drop(['Location'], axis=1)
    # get month
    # NOTE(review): assumes the Date column looks like "d/m/y" so index 1 is
    # the month — confirm against the CSV's actual date format.
    X["Date"] = X["Date"].apply(lambda x: int(x.split("/")[1]))
    X = X.rename(columns={"Date": "Month"})
    # fill Null object-data up with most frequent value
    cate = X.columns[X.dtypes == "object"].tolist()
    si = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
    si.fit(X.loc[:, cate])
    X.loc[:, cate] = si.transform(X.loc[:, cate])
    # encode object data
    # NOTE(review): `oe` is refitted on Y below, so the X encoder is lost —
    # fine while nothing needs to invert the X encoding; confirm.
    oe = OrdinalEncoder()
    oe = oe.fit(X.loc[:, cate])
    X.loc[:, cate] = oe.transform(X.loc[:, cate])
    oe = oe.fit(Y.loc[:, :])
    Y.loc[:, :] = oe.transform(Y.loc[:, :])
    # fill float data up with mean value.
    col = X.columns[X.dtypes == "float64"].tolist()
    impmean = SimpleImputer(missing_values=np.nan, strategy="mean")
    impmean = impmean.fit(X.loc[:, col])
    X.loc[:, col] = impmean.transform(X.loc[:, col])
    return X, Y
def labelEncoding(self, data_column):
    """Ordinal-encode a column of categorical values and return the codes."""
    logger.info('[{}] : [INFO] Label encoding ...'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
    encoder = OrdinalEncoder()
    encoder.fit(data_column)
    return encoder.transform(data_column)
def regplot(X, y):
    '''Function for plotting the variables of input X against the target y

    Draws a 2x3 grid: bmi, children and age as-is, plus smoker and sex
    after ordinal-encoding them to 0/1. Shows the figure; returns nothing.
    '''
    fig, axes = plt.subplots(2, 3, figsize=(16, 8))
    fig.suptitle('charges for insurance')
    sns.regplot(ax=axes[0, 0], x='bmi', y=y, data=X)
    #axes[0].set_xlabel('bmi')
    sns.regplot(ax=axes[0, 1], x='children', y=y, data=X)
    sns.regplot(ax=axes[1, 0], x='age', y=y, data=X)
    # NOTE(review): 'age' is plotted twice (axes[1, 0] and axes[1, 1]);
    # one of these was probably meant to be a different variable — confirm.
    sns.regplot(ax=axes[1, 1], x='age', y=y, data=X)
    ###### encoding of ordinal data
    ordinal_encoder = OrdinalEncoder(categories=[['no', 'yes']])
    ordinal_encoder.fit(X[['smoker']])
    smoker_encoded = pd.DataFrame(ordinal_encoder.transform(X[['smoker']]))
    ######
    sns.regplot(ax=axes[0, 2], x=smoker_encoded, y=y)
    # NOTE(review): the labels mix '(' with ']' — likely typos for '(...)'.
    axes[0, 2].set_xlabel('smoker(0:No, 1:Yes]')
    ordinal_encoder1 = OrdinalEncoder(categories=[['female', 'male']])
    ordinal_encoder1.fit(X[['sex']])
    sex_encoded = pd.DataFrame(ordinal_encoder1.transform(X[['sex']]))
    sns.regplot(ax=axes[1, 2], x=sex_encoded, y=y)
    axes[1, 2].set_xlabel('sex(0:f, 1:m]')
    axes[1, 2].set_xlim(0, 1)
    axes[1, 2].set_ylim(0, 60000)
    plt.show()
class SklearnEncoder(object):
    """Thin wrapper unifying sklearn's Label/OneHot/Ordinal encoders behind
    one interface, dispatching on the encoder_type string."""

    def __init__(self, encoder_type):
        # encoder_type selects the wrapped sklearn encoder:
        # "Label" | "OneHot" | "Ordinal".
        self.encoder_type = encoder_type
        if self.encoder_type == "Label":
            self.encoder_module = LabelEncoder()
        elif self.encoder_type == "OneHot":
            self.encoder_module = OneHotEncoder()
        elif self.encoder_type == "Ordinal":
            # Ordinal encoding
            self.encoder_module = OrdinalEncoder()

    def _fit(self, x, y=None):
        # LabelEncoder takes its data as `y`; the others take `X`.
        if self.encoder_type == "Label":
            self.encoder_module.fit(y=x)
        else:
            self.encoder_module.fit(X=x, y=y)

    def _transform(self, x):
        if self.encoder_type == "Label":
            return self.encoder_module.transform(y=x)
        else:
            return self.encoder_module.transform(X=x)

    def _fit_transform(self, x, y=None):
        if self.encoder_type == "Label":
            return self.encoder_module.fit_transform(y=x)
        else:
            return self.encoder_module.fit_transform(X=x, y=y)

    def _reversal(self, x):
        # Exact inverse of _transform. Bug fix: LabelEncoder's
        # inverse_transform takes `y`, not `X` — the unconditional X= keyword
        # raised TypeError for the "Label" encoder type.
        if self.encoder_type == "Label":
            return self.encoder_module.inverse_transform(y=x)
        return self.encoder_module.inverse_transform(X=x)
def preprocess(df):
    """One-hot encode the categorical predictors and ordinal-encode STATUS.

    Args:
        df: frame containing object-dtype predictors, 'STATUS', 'REHIRE',
            'JOB_SATISFACTION', 'TERMINATION_YEAR' and 'EMP_ID' columns.

    Returns:
        tuple: (X feature frame with dummy columns, y 1-D encoded STATUS array).
    """
    cat_vars = df.columns[df.dtypes == object]
    c = cat_vars.tolist()
    # STATUS is the target; REHIRE / JOB_SATISFACTION are treated as
    # categorical even though they are not object-dtype.
    c.remove('STATUS')
    c.append('REHIRE')
    c.append('JOB_SATISFACTION')
    for var in c:
        # Bug fix: removed the dead assignment `cat_list = 'var'+'_'+var`,
        # which was immediately overwritten, and the needless data1 temp.
        dummies = pd.get_dummies(df[var], prefix=var)
        df = df.join(dummies)
    # Keep the dummy columns, drop the originals plus identifier columns.
    data_vars = df.columns.values.tolist()
    to_keep = [i for i in data_vars if i not in c]
    to_keep.remove('TERMINATION_YEAR')
    to_keep.remove('EMP_ID')
    data_final = df[to_keep]
    col1 = data_final.columns.tolist()
    col1.remove('STATUS')
    X = data_final[col1]
    # Ordinal-encode the target into a flat numeric array.
    y_raw = np.array(data_final['STATUS']).reshape(-1, 1)
    encoder = OrdinalEncoder()
    encoder.fit(y_raw)
    y = encoder.transform(y_raw).ravel()
    return X, y
class OrdinalEncodeCategoricalVariables(BaseEstimator, TransformerMixin):
    """Sklearn transformer that ordinal-encodes the configured categorical
    columns.

    NOTE(review): transform() writes into the passed frame without taking a
    copy — confirm callers expect in-place mutation.
    """
    # order and encode categorical variables
    # self.variables --> CATEGORICAL_VARIABLES
    def __init__(self, variables=None):
        # Normalize a single column name into a one-element list.
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Fit an OrdinalEncoder on self.variables; y is ignored."""
        # get_dummies isn't appropriate so use ordinal_map
        # add points column to X so groupby works!
        #X = X.copy()
        #print()
        #print(X.dtypes)
        self.enc = OrdinalEncoder()
        self.enc.fit(X[self.variables])
        return self

    def transform(self, X):
        """Replace the categorical columns with their integer codes."""
        X[self.variables] = self.enc.transform(X[self.variables])
        #print()
        #print(X.dtypes)
        return X
def change_Categorical_ord_5_Data(input_train_data, input_test_data):
    """Ordinal-encode the 'ord_5' column of both frames, fitting on train only.

    NOTE(review): test rows with ord_5 values unseen in training will make
    transform() raise — confirm the data guarantees full category coverage.

    Returns the two (mutated) frames.
    """
    encoder = OrdinalEncoder(categories='auto')
    encoder.fit(input_train_data.ord_5.values.reshape(-1, 1))
    # transform() yields an (n, 1) array assigned straight onto the column;
    # presumably pandas squeezes it to 1-D — verify on the pandas version used.
    input_train_data.ord_5 = encoder.transform(
        input_train_data.ord_5.values.reshape(-1, 1))
    input_test_data.ord_5 = encoder.transform(
        input_test_data.ord_5.values.reshape(-1, 1))
    return input_train_data, input_test_data
def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
    """Checks that ordinal encoder transforms string dtypes.

    Non-regression test for #19872."""
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                             unknown_value=-9)
    encoder.fit(X_train)
    assert_allclose(encoder.transform(X_test), [[-9, 0]])
def test_ordinalencoder():
    """convert_estimator's OrdinalEncoder port must match sklearn's transform."""
    datasets = (
        [["Male", 1], ["Female", 3], ["Female", 2]],
        [["Male", 1], ["Female", 27], ["Bananas", 2]],
    )
    for data in datasets:
        encoder = OrdinalEncoder()
        encoder.fit(data)
        converted = convert_estimator(encoder)
        assert np.allclose(encoder.transform(data), converted.transform(data))
def encodeOrdinal(data, col_names):
    """Ordinal-encode the given columns of `data` in place.

    Returns the mutated frame together with the fitted encoder so the
    transformation can be inverted later."""
    encoder = OrdinalEncoder()
    encoder.fit(data[col_names])
    data[col_names] = encoder.transform(data[col_names])
    return data, encoder
def test_ordinal_encoder_raise_categories_shape():
    """A flat (non-nested) `categories` list must be rejected at fit time."""
    values = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']],
                      dtype=object).T
    flat_categories = ['Low', 'Medium', 'High']
    encoder = OrdinalEncoder(categories=flat_categories)
    expected = "Shape mismatch: if categories is an array,"
    with pytest.raises(ValueError, match=expected):
        encoder.fit(values)
def test_ordinal_encoder_handle_unknowns_nan():
    """unknown_value=np.nan must map unseen categories to NaN."""
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                             unknown_value=np.nan)
    encoder.fit(np.array([[1], [2], [3]]))
    result = encoder.transform([[1], [2], [4]])
    assert_array_equal(result, [[0], [1], [np.nan]])
def get_ordinalencoder(df: pd.DataFrame) -> "tuple[OrdinalEncoder, list]":
    """Fit an OrdinalEncoder on all columns of `df`, sorted by name.

    Bug fix: the return annotation claimed only an OrdinalEncoder, but the
    function returns (encoder, column list); the annotation now matches.

    Args:
        df: frame whose columns are all to be ordinal-encoded.

    Returns:
        tuple: (fitted OrdinalEncoder, sorted list of column names — the
        column order the encoder expects at transform time).
    """
    # sorted(set(...)) keeps the original de-duplicate-then-sort behavior
    # in a single deterministic step.
    ordcol = sorted(set(df.columns))
    enc = OrdinalEncoder()
    enc.fit(df[ordcol].values)
    return enc, ordcol
def test_ordinal_encoder_raise_categories_shape():
    """Passing a flat `categories` list (not a list of lists) must raise."""
    column = np.array([["Low", "Medium", "High", "Medium", "Low"]],
                      dtype=object).T
    encoder = OrdinalEncoder(categories=["Low", "Medium", "High"])
    with pytest.raises(ValueError,
                       match="Shape mismatch: if categories is an array,"):
        encoder.fit(column)
class OrdinalEncoding(AutoSklearnPreprocessingAlgorithm):
    """Auto-sklearn component: ordinal-encode dense input, shifting codes by
    +1 so unknown categories (encoded -1 at fit time) come out as 0."""

    def __init__(
        self,
        random_state: Optional[np.random.RandomState] = None,
    ):
        # random_state is unused here; kept for the component API.
        self.random_state = random_state

    def fit(self, X: PIPELINE_DATA_DTYPE,
            y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'OrdinalEncoding':
        """Fit an OrdinalEncoder on dense X; sparse input is left untouched.

        NOTE(review): self.preprocessor is only assigned on the dense path —
        transform() after a sparse fit followed by dense input would hit an
        AttributeError rather than the NotImplementedError guard; confirm
        that mixed sparse/dense usage cannot occur.
        """
        if not scipy.sparse.issparse(X):
            self.preprocessor = OrdinalEncoder(
                categories='auto',
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            )
            self.preprocessor.fit(X, y)
        return self

    def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
        """Return encoded X with all codes shifted to be non-negative."""
        if scipy.sparse.issparse(X):
            # Sparse data should be float dtype, which means we do not need
            # to further encode it.
            return X
        if self.preprocessor is None:
            raise NotImplementedError()
        # Notice we are shifting the unseen categories during fit to 1
        # from -1, 0, ... to 0,..., cat + 1
        # This is done because Category shift requires non negative integers
        # Consider removing this if that step is removed
        return self.preprocessor.transform(X) + 1

    @staticmethod
    def get_properties(
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
    ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        """Static capability flags consumed by the auto-sklearn pipeline."""
        return {
            'shortname': 'OrdinalEncoder',
            'name': 'Ordinal Encoder',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            # TODO find out of this is right!
            'handles_sparse': True,
            'handles_dense': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (INPUT, ),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
    ) -> ConfigurationSpace:
        """This component exposes no tunable hyperparameters."""
        return ConfigurationSpace()
def test_ordinal_encoder_raise_missing(X):
    """fit, fit_transform and transform must all reject NaN-containing input."""
    encoder = OrdinalEncoder()

    with pytest.raises(ValueError, match="Input contains NaN"):
        encoder.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        encoder.fit_transform(X)

    # A clean fit on the NaN-free first row still rejects NaN at transform.
    encoder.fit(X[:1, :])
    with pytest.raises(ValueError, match="Input contains NaN"):
        encoder.transform(X)
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Manually specified categories: stored verbatim on the estimator,
    coerced to the data dtype in categories_, and unknown values fail at fit."""
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    # The constructor parameter is kept as passed; categories_ is the
    # fitted (possibly dtype-coerced) copy.
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype
    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)