def test_ordinal_encoder_raise_missing(X):
    """OrdinalEncoder must reject NaN-bearing input in fit, fit_transform and transform."""
    encoder = OrdinalEncoder()

    with pytest.raises(ValueError, match="Input contains NaN"):
        encoder.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        encoder.fit_transform(X)

    # Fit only on the first row (assumed NaN-free — TODO confirm via the
    # fixture), then transforming the full NaN-bearing input must still raise.
    encoder.fit(X[:1, :])
    with pytest.raises(ValueError, match="Input contains NaN"):
        encoder.transform(X)
class OrdinalEncodeCategoricalVariables(BaseEstimator, TransformerMixin):
    """Ordinal-encode the categorical columns named in ``variables``.

    sklearn-style transformer: ``fit`` learns an ordinal mapping for the
    configured columns; ``transform`` replaces those columns with their codes.
    """

    def __init__(self, variables=None):
        # Accept a single column name or a list of names; normalize to a list.
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Learn the category -> code mapping for each configured column.

        get_dummies isn't appropriate here, so an ordinal mapping is used.
        """
        # Removed leftover debug prints of X.dtypes.
        self.enc = OrdinalEncoder()
        self.enc.fit(X[self.variables])
        return self

    def transform(self, X):
        """Return a copy of X with the configured columns ordinal-encoded."""
        # Work on a copy so the caller's DataFrame is not mutated (the original
        # had `X = X.copy()` commented out and clobbered the input in place).
        X = X.copy()
        X[self.variables] = self.enc.transform(X[self.variables])
        return X
def load_data(path, path_on):
    """Load offline/online TSV data, fill missing values, and ordinal-encode.

    Returns (x, y, oe, cols): encoded offline features, offline labels, the
    fitted encoder, and the feature column names.
    """
    df = pd.read_csv(path, sep="\t")
    df = precession(df)
    # Features found to have very low importance (per feature_importance
    # analysis) are dropped. `axis` must be a keyword: the positional form
    # was deprecated and removed in pandas 2.
    df.drop(['imeimd5', 'openudidmd5', 'os', 'adidmd5', 'idfamd5'],
            axis=1, inplace=True)
    # TODO: missing values are filled with "empty", the placeholder already
    # used by the raw data, so the fill value becomes one more category.
    df = df.fillna("empty")
    # Offline training data.
    x = df.drop(['label', 'sid'], axis=1)
    y = df['label']
    cols = x.columns
    # Online data.
    df_on = pd.read_csv(path_on, sep="\t")
    df_on = precession(df_on)
    df_on.drop(['imeimd5', 'openudidmd5', 'os', 'adidmd5', 'idfamd5'],
               axis=1, inplace=True)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # Fit the encoder on offline + online data combined so categories seen
    # only online still get a code.
    x_all = pd.concat([x, x_on], axis=0)
    # Encode every string feature as integers; object dtypes are handled
    # automatically by OrdinalEncoder.
    oe = OrdinalEncoder()
    oe.fit(x_all)
    x = oe.transform(x)
    print(x.shape)
    return x, y, oe, cols
def get_sklearn_accuracy(mode):
    """
    Compute accuracy of a sklearn decision tree classifier.

    Input:
        mode: "train" for training accuracy; any other truthy value
              yields test accuracy
    Output:
        accuracy score for the sklearn model
    """
    if not mode:
        raise ValueError("Specify mode")

    # Load the datasets
    train = pd.read_csv("data/small_train.csv")
    test = pd.read_csv("data/small_test.csv")

    # Ordinal-encode every column (features and label alike)
    encoder = OrdinalEncoder()
    encoded_train = encoder.fit_transform(train)
    encoded_test = encoder.transform(test)

    # Last column is the label
    x_train, y_train = encoded_train[:, :-1], encoded_train[:, -1]
    x_test, y_test = encoded_test[:, :-1], encoded_test[:, -1]

    # Fit the tree on the training split
    mod = DecisionTreeClassifier().fit(x_train, y_train)

    # Score on whichever split was requested
    x_eval, y_eval = (x_train, y_train) if mode == "train" else (x_test, y_test)
    return accuracy_score(y_eval, mod.predict(x_eval))
class MatrixOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is defined by a matrix"""

    def __init__(self, values, matrix, undefined=("n.a.",)):
        super().__init__()
        self.values = values
        self.matrix = matrix
        self.undefined = undefined
        self.n = len(values)
        self.encoder = None

    def get_description(self):
        cls = self.__class__
        return {
            "__class__": cls.__module__ + "." + cls.__name__,
            "values": self.values,
            "matrix": self.matrix,
            "undefined": self.undefined,
        }

    def fit(self, X, y=None):
        # Known values are encoded first, then the "undefined" markers; the X
        # argument itself is irrelevant to the learned mapping.
        vocabulary = self.values + list(self.undefined)
        self.encoder = OrdinalEncoder([vocabulary], dtype=int)
        self.encoder.fit([[v] for v in vocabulary])
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X)

    def similarity(self, x, y):
        # Codes at or beyond n correspond to "undefined" markers.
        if x >= self.n or y >= self.n:
            return np.nan
        return self.matrix[x][y]
class LinearOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is linear with respect to a scale"""

    def __init__(self, order, undefined=("n.a.",)):
        """
        Args:
            order (list): List of values, defining their ordering.
            undefined (iterable): Values which are recognized, but not
                comparable to the ranking. When such a value is found, the
                similarity returned is nan.
        """
        super().__init__()
        self.order = order
        self.undefined = undefined
        self.n = len(order)
        self.encoder = None

    def get_description(self):
        cls = self.__class__
        return {
            "__class__": cls.__module__ + "." + cls.__name__,
            "order": self.order,
            "undefined": self.undefined,
        }

    def fit(self, X, y=None):
        # Ordered values first, then the "undefined" markers; the X argument
        # itself is irrelevant to the learned mapping.
        vocabulary = self.order + list(self.undefined)
        self.encoder = OrdinalEncoder([vocabulary])
        self.encoder.fit([[v] for v in vocabulary])
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X)

    def similarity(self, x, y):
        # Codes at or beyond n correspond to "undefined" markers.
        if x >= self.n or y >= self.n:
            return np.nan
        # Linear scale: adjacent ranks differ by 1/(n-1) in similarity.
        return 1 - abs(x - y) / (self.n - 1)
def encode_categories(final_db):
    '''Take needed features from the dataset and encode string into categorical numbers

    Inputs:
      - final_db (Pandas dataframe): cleaned dataframe
    Outputs:
      - X (Pandas dataframe): feature matrix dimension NxD, where N is the
        datapoints number and D the number of features
      - y (numpy array): labels array (binary or multiclass), dimension Nx1
      - enc (sklearn OrdinalEncoder): ordinal encoder used (to be used in
        decoding after)
    '''
    # String-valued columns needing ordinal encoding: defined once instead of
    # being repeated verbatim at all three use sites.
    categorical = ['exposure_type', 'conc1_type', 'species', 'class',
                   'tax_order', 'family', 'genus']

    # Feature selection
    X = final_db.copy()
    X = X[['exposure_type', 'obs_duration_mean', 'conc1_type', 'species',
           'class', 'tax_order', 'family', 'genus', 'atom_number',
           'alone_atom_number', 'tripleBond', 'doubleBond', 'bonds_number',
           'ring_number', 'Mol', 'MorganDensity', 'LogP']]
    y = final_db.score.copy().values

    # Encoding phase
    enc = OrdinalEncoder(dtype=int)
    enc.fit(X[categorical])
    X[categorical] = enc.transform(X[categorical])

    return X, y, enc
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    """Benchmark CustomOrdinalFeatureEncoder against sklearn's OrdinalEncoder.

    Runs fit/transform/inverse_transform `iterations` times on each encoder
    and returns the pair of mean wall-clock times (custom, sklearn).
    """
    # Bug fix: `if not X:` raises "truth value of an array is ambiguous" for
    # any multi-element numpy array actually passed in; test against None.
    if X is None:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])
    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()

    ordinal_encoder_time = []
    custom_encoder_time = []
    for i in range(iterations):
        # Time a full round-trip on the custom encoder.
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        # Time the same round-trip on sklearn's encoder.
        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)

    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
def labelEncoding(self, data_column):
    """Ordinal-encode a column of labels and return the encoded values."""
    timestamp = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    logger.info('[{}] : [INFO] Label encoding ...'.format(timestamp))
    encoder = OrdinalEncoder()
    encoder.fit(data_column)
    return encoder.transform(data_column)
def load_data_rasar(DATA_PATH, encoding, seed=42):
    """Load the RASAR dataset, scale/encode features and split train/test.

    Args:
        DATA_PATH: CSV path of the raw dataset.
        encoding: 'binary' or 'multiclass' target encoding for conc1_mean.
        seed: random_state for the train/test split.

    Returns:
        (X_try, X_train, X_test, y_train, y_test, len_X_train) where X_try is
        train followed by test re-concatenated and len_X_train marks where the
        training rows end inside it.
    """
    db = pd.read_csv(DATA_PATH).drop(
        columns=['Unnamed: 0', 'test_cas', 'smiles'])

    numerical = [
        'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP',
        'alone_atom_number', 'doubleBond', 'tripleBond', 'ring_number',
        'oh_count', 'MeltingPoint', 'WaterSolubility'
    ]
    # Categorical columns + obs_duration_mean (already numeric).
    categorical = [
        'conc1_type', 'exposure_type', 'control_type', 'media_type',
        'application_freq_unit', 'species', 'class', 'tax_order', 'family',
        'genus'
    ]

    # Min-max scale each numerical variable independently.
    for nc in numerical:
        minmax = MinMaxScaler()
        minmax.fit(db[[nc]])
        db[[nc]] = minmax.transform(db[[nc]])

    # Ordinal-encode categorical variables; +1 shifts codes to start at 1.
    encoder = OrdinalEncoder(dtype=int)
    encoder.fit(db[categorical])
    db[categorical] = encoder.transform(db[categorical]) + 1

    # Expand the pubchem2d fingerprint string into columns pub1..pub881.
    db = pd.concat([db, pd.DataFrame(pd.DataFrame(db['pubchem2d'].values).\
                   apply(lambda x: x.str.replace('', ' ').str.strip().str.split(' '),
                         axis = 1)[0].to_list(),
                   columns = ['pub'+ str(i) for i in range(1,882)])],
                   axis = 1)

    db.drop(columns=['fish'], inplace=True)

    # Encode the target variable: binary or multiclass.
    if encoding == 'binary':
        db['conc1_mean'] = np.where(db['conc1_mean'].values > 1, 0, 1)
    elif encoding == 'multiclass':
        t = db['conc1_mean'].copy()
        db['conc1_mean'] = multiclass_encoding(t)

    X = db.drop(columns='conc1_mean')
    y = db['conc1_mean'].values

    # Splitting.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=seed)

    # Rejoin train with test. DataFrame.append was removed in pandas 2;
    # pd.concat is the supported equivalent.
    X_try = pd.concat([X_train, X_test])

    # Keep track of where the training rows end inside X_try.
    len_X_train = len(X_train)

    return X_try, X_train, X_test, y_train, y_test, len_X_train
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas."""
    # checks pandas dataframe with categorical features
    pd = pytest.importorskip("pandas")
    missing = pd.NA if pd_nan_type == "pd.NA" else np.nan

    df = pd.DataFrame(
        {"col1": pd.Series(["c", "a", missing, "b", "a"], dtype="category")}
    )

    oe = OrdinalEncoder().fit(df)
    # Single feature: alphabetical categories, with the missing marker last.
    assert len(oe.categories_) == 1
    assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
    assert np.isnan(oe.categories_[0][-1])

    df_trans = oe.transform(df)
    # Codes follow alphabetical order; the missing entry stays NaN.
    assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])

    # Round-trip back to the original values (NaN included).
    X_inverse = oe.inverse_transform(df_trans)
    assert X_inverse.shape == (5, 1)
    assert_array_equal(X_inverse[:2, 0], ["c", "a"])
    assert_array_equal(X_inverse[3:, 0], ["b", "a"])
    assert np.isnan(X_inverse[2, 0])
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas."""
    # checks pandas dataframe with categorical features
    if pd_nan_type == 'pd.NA':
        # pd.NA requires pandas >= 1.0
        pd = pytest.importorskip('pandas', minversion="1.0")
        missing = pd.NA
    else:
        pd = pytest.importorskip('pandas')
        missing = np.nan

    df = pd.DataFrame(
        {'col1': pd.Series(['c', 'a', missing, 'b', 'a'], dtype='category')}
    )

    oe = OrdinalEncoder().fit(df)
    # Single feature: alphabetical categories, with the missing marker last.
    assert len(oe.categories_) == 1
    assert_array_equal(oe.categories_[0][:3], ['a', 'b', 'c'])
    assert np.isnan(oe.categories_[0][-1])

    df_trans = oe.transform(df)
    # Codes follow alphabetical order; the missing entry stays NaN.
    assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])

    # Round-trip back to the original values (NaN included).
    X_inverse = oe.inverse_transform(df_trans)
    assert X_inverse.shape == (5, 1)
    assert_array_equal(X_inverse[:2, 0], ['c', 'a'])
    assert_array_equal(X_inverse[3:, 0], ['b', 'a'])
    assert np.isnan(X_inverse[2, 0])
def setup():
    """Download (if needed) and load the 30-days-of-ml competition data.

    Returns:
        (X, y, X_test): ordinal-encoded training features, the training
        target, and the test features encoded with the same fitted encoder.
    """
    # Fetch and unpack the competition data on first run only.
    if not path.isfile(".data/30-days-of-ml.zip"):
        os.system("kaggle competitions download -c 30-days-of-ml")
    if not path.isdir(".data/30-days-of-ml/"):
        os.system("unzip .data/30-days-of-ml.zip -d .data/30-days-of-ml")

    # Load the training data
    train = pd.read_csv(".data/30-days-of-ml/train.csv", index_col=0)
    test = pd.read_csv(".data/30-days-of-ml/test.csv", index_col=0)

    # Separate target from features. (The discarded `.head()` preview calls
    # from the original were dead code and have been removed.)
    y = train['target']
    features = train.drop(['target'], axis=1)

    # Columns whose name marks them as categorical
    object_cols = [col for col in features.columns if 'cat' in col]

    # Ordinal-encode categorical columns; the same fitted encoder is applied
    # to the test set so the codes are consistent across both frames.
    X = features.copy()
    X_test = test.copy()
    ordinal_encoder = OrdinalEncoder()
    X[object_cols] = ordinal_encoder.fit_transform(features[object_cols])
    X_test[object_cols] = ordinal_encoder.transform(test[object_cols])

    return X, y, X_test
def test_encoders_string_categories(input_dtype, category_dtype, array_type):
    """Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    """
    X = np.array([["b"], ["a"]], dtype=input_dtype)
    categories = [np.array(["b", "a"], dtype=category_dtype)]
    X_test = _convert_container(
        [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype
    )

    # One-hot: column order follows the explicit categories (b first, then a).
    ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)
    assert_allclose(
        ohe.transform(X_test), np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
    )

    # Ordinal: codes follow the explicit categories (b -> 0, a -> 1).
    oe = OrdinalEncoder(categories=categories).fit(X)
    assert_array_equal(oe.transform(X_test), np.array([[1], [1], [0], [1]]))
def load_data(path, path_on):
    """Load offline/online CSV data, fill missing values, and ordinal-encode.

    Returns (x, y, oe, cols): encoded offline features, offline labels, the
    fitted encoder, and the feature column names.
    """
    df = pd.read_csv(path)
    df = precession(df)
    # Features with very low importance (per feature_importance analysis)
    # were already removed upstream.
    # TODO: missing values are filled with "empty", the placeholder already
    # used by the raw data, so the fill value becomes one more category.
    df = df.fillna("empty")
    # Offline training data.
    x = df.drop(['label', 'sid'], axis=1)
    y = df['label']
    cols = x.columns
    # Online data.
    df_on = pd.read_csv(path_on)
    df_on = precession(df_on)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # Fit the encoder on offline + online data combined so categories seen
    # only online still get a code. `axis` must be a keyword: the positional
    # form was deprecated and removed in pandas 2.
    x_all = pd.concat([x, x_on], axis=0)
    print(x_all.shape)
    # Encode every string feature as integers.
    oe = OrdinalEncoder()
    oe.fit(x_all)
    x = oe.transform(x)
    print(x.shape)
    return x, y, oe, cols
def _encode_feature(self, mat, feature_column):
    """Fit an OrdinalEncoder on one column of `mat`, remember it in
    self.feature_encoders, and write the codes back into the column."""
    column_values = mat[feature_column].to_numpy().reshape(-1, 1)
    encoder = OrdinalEncoder()
    encoder.fit(column_values)
    self.feature_encoders[feature_column] = encoder
    mat.loc[:, feature_column] = encoder.transform(column_values)
    return mat
def ordinal_encoder(params):
    """Ordinal-encode a (train, test) pair of 1-D arrays as strings.

    NOTE(review): categories present only in the test array will raise at
    transform time — confirm that callers guarantee full coverage in train.
    """
    train_values = params[0].astype('str')
    test_values = params[1].astype('str')
    encoder = OrdinalEncoder()
    encoded_train = encoder.fit_transform(train_values.reshape(-1, 1))
    encoded_test = encoder.transform(test_values.reshape(-1, 1))
    return encoded_train.flatten(), encoded_test.flatten()
def new_data_encoding(self, types_dict={}):
    """Encode self.data columns according to types_dict and persist encoders.

    types_dict maps a column name to 0 (ordinal-encode) or 1 (one-hot-encode).
    Ordinal columns are replaced in place; one-hot columns are expanded into
    one column per category at the original column's position. The fitted
    encoders are pickled to datahandle/ordinal.pkl and datahandle/onehot.pkl.

    NOTE(review): the mutable default `types_dict={}` is shared across calls;
    harmless while it is only read, but fragile.
    """
    key_list = list(types_dict.keys())
    ordinal_list = []
    onehot_list = []
    ordinal = OrdinalEncoder()
    onehot = OneHotEncoder()
    result = []
    # Partition columns by requested encoding: 0 -> ordinal, 1 -> one-hot.
    for key in key_list:
        if types_dict[key] == 0:
            ordinal_list.append(key)
        elif types_dict[key] == 1:
            onehot_list.append(key)
    print(ordinal_list)
    print(onehot_list)
    # --- ordinal encoding (in place) ---
    temp_o = self.data.loc[:, ordinal_list]
    if len(ordinal_list) == 1:
        # Single column: sklearn encoders need a 2-D input, hence the reshape.
        ordinal.fit(temp_o.values.reshape(-1, 1))
        self.data.loc[:, ordinal_list] = ordinal.transform(
            temp_o.values.reshape(-1, 1))
    elif len(ordinal_list):
        ordinal.fit(temp_o)
        self.data.loc[:, ordinal_list] = ordinal.transform(temp_o)
    # Persist the fitted ordinal encoder (dumped even if no ordinal columns).
    joblib.dump(ordinal, 'datahandle/ordinal.pkl')
    # --- one-hot encoding (columns expanded in place) ---
    temp_hot = self.data.loc[:, onehot_list]
    if len(onehot_list) == 1:
        # NOTE(review): this branch writes the sparse transform result back
        # into a single column rather than expanding it — confirm intended.
        onehot.fit(temp_hot.values.reshape(-1, 1))
        self.data.loc[:, onehot_list] = onehot.transform(
            temp_hot.values.reshape(-1, 1))
    elif len(onehot_list):
        onehot.fit(temp_hot)
        # Dense matrix of all one-hot columns for every encoded source column.
        result = onehot.transform(temp_hot).toarray()
        result = pd.DataFrame(result)
        columns = []
        joblib.dump(onehot, 'datahandle/onehot.pkl')
        # Name the dummy columns after the categories, in encoder order.
        for l in onehot.categories_:
            columns = columns + list(l)
        result.columns = columns
        # Splice each block of dummy columns into self.data at the position
        # of the original column, dropping the original.
        for i in range(len(onehot_list)):
            key = list(onehot.categories_[i])
            temp = result.loc[:, key]
            pos = self.data.columns.get_loc(onehot_list[i])
            data1 = self.data.iloc[:, 0:pos]
            data2 = self.data.iloc[:, pos + 1:]
            data1 = pd.concat([data1, temp], axis=1)
            self.data = pd.concat([data1, data2], axis=1)
    print(self.data)
def test_value_difference_metric_property(dtype, k, r, y_type, encode_label):
    # Check the property of the vdm distance described in "Improved
    # Heterogeneous Distance Functions", D.R. Wilson and T.R. Martinez,
    # Journal of Artificial Intelligence Research 6 (1997) 1-34
    # https://arxiv.org/pdf/cs/9701101.pdf
    #
    # "if an attribute color has three values red, green and blue, and the
    # application is to identify whether or not an object is an apple, red and
    # green would be considered closer than red and blue because the former two
    # both have similar correlations with the output class apple."

    # Feature: a single categorical column with three colors.
    X = np.array(["green"] * 10 + ["red"] * 10 + ["blue"] * 10).reshape(-1, 1)

    # Target: 0 - not an apple / 1 - an apple, mapped to string labels.
    y_labels = np.array(["not apple", "apple"], dtype=object)
    y = y_labels[np.array([1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1])]
    y = _convert_container(y, y_type)
    if encode_label:
        y = LabelEncoder().fit_transform(y)

    encoder = OrdinalEncoder(dtype=dtype)
    X_encoded = encoder.fit_transform(X)

    vdm = ValueDifferenceMetric(k=k, r=r)
    vdm.fit(X_encoded, y)

    samples = {
        color: encoder.transform([[color]])
        for color in ("green", "red", "blue")
    }

    # The distance between two samples of the same category must be zero.
    for sample in samples.values():
        assert vdm.pairwise(sample).squeeze() == pytest.approx(0)

    # Property from the paper's introduction example.
    dist_green_red = vdm.pairwise(samples["green"], samples["red"]).squeeze()
    dist_blue_red = vdm.pairwise(samples["blue"], samples["red"]).squeeze()
    dist_blue_green = vdm.pairwise(samples["blue"], samples["green"]).squeeze()

    # green and red are very close; blue is closer to red than to green.
    assert dist_green_red < dist_blue_red
    assert dist_green_red < dist_blue_green
    assert dist_blue_red < dist_blue_green
def encode_column(encoding_type, col_name):
    """Encode the module-level `df[col_name]` column in place.

    encoding_type:
        'label'   -> LabelEncoder (alphabetical codes)
        'ordinal' -> OrdinalEncoder with the explicit rank order
                     AsstProf (0) < AssocProf (1) < Prof (2)
    """
    if encoding_type == 'label':
        le = LabelEncoder()
        le.fit(df[col_name])
        title_order = list(le.classes_)
        # `le` is already fitted above, so plain transform suffices (the old
        # fit_transform refit redundantly).
        df[col_name] = le.transform(df[col_name])
        print("Label Encoded")
    if encoding_type == 'ordinal':
        # Bug fix: the previous version fitted the encoder on two-column
        # [title, rank] pairs and then discarded the transform result, so the
        # column was never actually encoded (and the shapes did not even
        # match). Encode with an explicit category order and write the codes
        # back into the DataFrame.
        title_order = ["AsstProf", "AssocProf", "Prof"]
        oe = OrdinalEncoder(categories=[title_order])
        df[col_name] = oe.fit_transform(df[[col_name]])
        print("Ordinal Encoded")
    return
class CatSklearnAttacker(PrivacyAttackerModel):
    """Base class for categorical attacker based on sklearn models.

    Attributes:
        key_type (CategoricalType): Required key attribute type (class_num or
            one_hot) by the learner.
        sensitive_type (CategoricalType): Required sensitive attribute type
            (class_num or one_hot) by the learner.
        skl_learner (Class): A (wrapped) sklearn classifier class that can be
            called with no arguments.
    """

    # Subclasses override these three class attributes.
    KEY_TYPE = None
    SENSITIVE_TYPE = None
    SKL_LEARNER = None

    def __init__(self):
        # Pick the attribute preprocessors the configured learner requires:
        # ordinal codes for CLASS_NUM, one-hot vectors otherwise.
        self.predictor = self.SKL_LEARNER()
        self.key_processor = OrdinalEncoder() if self.KEY_TYPE == CategoricalType.CLASS_NUM \
            else OneHotEncoder()
        self.sensitive_processor = OrdinalEncoder() if \
            self.SENSITIVE_TYPE == CategoricalType.CLASS_NUM else OneHotEncoder()

    def fit(self, synthetic_data, key, sensitive):
        """Fit encoders and the predictor on key/sensitive columns of the
        synthetic data (NaNs handled via allow_nan)."""
        key_table = allow_nan(synthetic_data[key])
        sensitive_table = allow_nan(synthetic_data[sensitive])
        self.key_processor.fit(key_table)
        self.sensitive_processor.fit(sensitive_table)
        key_train = self.key_processor.transform(key_table)
        sensitive_train = self.sensitive_processor.transform(sensitive_table)
        self.predictor.fit(key_train, sensitive_train)

    def predict(self, key_data):
        """Predict sensitive attribute values for one record of key values.

        Returns a tuple of predicted sensitive values in their original
        format, or None when the key values were never seen during fit.
        """
        keys = allow_nan_array(key_data)  # de-nan key attributes
        try:
            # key attributes in ML ready format
            keys_transform = self.key_processor.transform([keys])
        except ValueError:
            # Some attributes of the input haven't appeared in synthetic tables
            return None
        sensitive_pred = self.predictor.predict(keys_transform)
        # Normalize a 1-D prediction to the 2-D shape inverse_transform expects.
        if len(np.array(sensitive_pred).shape) == 1:
            sensitive_pred = [sensitive_pred]
        # predicted sensitive attributes in original format
        sensitives = self.sensitive_processor.inverse_transform(sensitive_pred)
        return tuple(sensitives[0])
def encode_batters(data):
    """Ordinal-encode the batter/runner ID columns of `data` in place.

    A single encoder is fit on the union of IDs across all four columns so
    the same player receives the same code everywhere. Returns `data`.
    """
    id_cols = ['BAT_ID', 'BASE1_RUN_ID', 'BASE2_RUN_ID', 'BASE3_RUN_ID']

    # Missing runner IDs become the empty string, encoded like any category.
    data[id_cols] = data[id_cols].fillna('')

    # One encoder over every distinct ID seen in any of the four columns.
    batters = np.unique(data[id_cols].values.reshape(-1))
    encoder = OrdinalEncoder().fit(batters.reshape(-1, 1))

    # Apply the shared encoder column by column (the previous version
    # duplicated this statement four times).
    for col in id_cols:
        data[col] = encoder.transform(
            data[col].values.reshape(-1, 1)).reshape(-1).astype(int)
    return data
def test_ordinalencoder():
    """convert_estimator must preserve OrdinalEncoder transform output."""
    datasets = (
        [["Male", 1], ["Female", 3], ["Female", 2]],
        [["Male", 1], ["Female", 27], ["Bananas", 2]],
    )
    for X in datasets:
        original = OrdinalEncoder()
        original.fit(X)
        converted = convert_estimator(original)
        assert np.allclose(original.transform(X), converted.transform(X))
def ordinal_encode_df(df, encoder=None):
    """Transform the object categories by means of ordinal encoding (in place).

    NaN values are temporarily replaced by the string 'nan' for encoding and
    restored as np.nan afterwards. Pass a pre-fitted `encoder` to reuse an
    existing mapping; otherwise one is fitted on this frame's object columns.
    """
    object_df = df.select_dtypes('object').replace(np.nan, 'nan')
    n_object_cols = object_df.shape[1]

    if encoder is None:
        ordinal_enc = OrdinalEncoder().fit(object_df)
    else:
        ordinal_enc = encoder

    object_ordinals = pd.DataFrame(
        ordinal_enc.transform(object_df)).astype('int')

    # Put the nan values back: find the code the encoder assigned to the
    # 'nan' placeholder in each column and replace it with np.nan.
    # (Previously hard-coded to exactly three object columns; now works for
    # any number of object columns.)
    nan_list = ordinal_enc.transform(np.array([['nan'] * n_object_cols]))
    for i in range(n_object_cols):
        object_ordinals.iloc[:, i] = object_ordinals.iloc[:, i].replace(
            nan_list[0, i], np.nan)

    # Write the encoded values back into the original frame.
    for i, col in enumerate(object_df.columns):
        df[col] = object_ordinals.iloc[:, i]
def encodeOrdinal(data, col_names):
    """Ordinal-encode the given columns of `data` in place.

    Returns (data, encoder) so the fitted encoder can be reused/inverted.
    """
    encoder = OrdinalEncoder()
    encoder.fit(data[col_names])
    data[col_names] = encoder.transform(data[col_names])
    return data, encoder
def loadCar():
    """Load car.csv, split off a holdout set, and encode features/labels.

    Returns (X, y, finalTestX, finalTestY) with the first 1401 rows as
    training data; features are ordinal-encoded and labels label-encoded
    using encoders fitted on the training portion.
    """
    data = pd.read_csv('car.csv')
    values = data.values
    X, y = values[:1401, 0:6], values[:1401, 6]
    finalTestX, finalTestY = values[1401:, 0:6], values[1401:, 6]
    print("Size of car data: ", len(X))

    # Encode the six feature columns with one shared encoder.
    feature_encoder = OrdinalEncoder()
    feature_encoder.fit(X)
    X = feature_encoder.transform(X)
    finalTestX = feature_encoder.transform(finalTestX)

    # Encode the class labels.
    label_encoder = LabelEncoder()
    label_encoder.fit(y)
    y = label_encoder.transform(y)
    finalTestY = label_encoder.transform(finalTestY)

    return X, y, finalTestX, finalTestY
def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
    """Checks that ordinal encoder transforms string dtypes.

    Non-regression test for #19872."""
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                             unknown_value=-9)
    encoder.fit(X_train)
    # The unseen category maps to -9; the known one keeps its code.
    assert_allclose(encoder.transform(X_test), [[-9, 0]])
def id3_adapret(**kwargs):
    """Train a depth-limited entropy decision tree and score it on test data.

    kwargs:
        train, test: DataFrames with a 'class' column of 'yes'/'no' labels.
        tolorance: maximum tree depth.

    Returns a dict with the accuracy percentage ('score') and the
    confusion-matrix counts ('TP', 'TN', 'FP', 'FN').
    """
    # getting data from kwargs
    train = kwargs['train']
    test = kwargs['test']
    t = kwargs['tolorance']

    # Build the encoder on train+test combined so categories that appear only
    # in the test set still get a code. `axis` must be a keyword argument:
    # the positional form was deprecated and removed in pandas 2.
    merged_data = pd.concat([train, test])
    merged_data_without_class = merged_data.drop('class', axis=1)
    encoder = OrdinalEncoder()
    encoder.fit(merged_data_without_class)

    # Separate the classification column from the feature columns.
    train_without_class = train.drop('class', axis=1)
    test_without_class = test.drop('class', axis=1)
    train_classifications = train['class']
    test_classifications = test['class']

    # Encode features; map yes/no labels to 1/0.
    encoded_train_without_class = encoder.transform(train_without_class)
    encoded_test_without_class = encoder.transform(test_without_class)
    encoded_train_classifications = train_classifications.map({'yes': 1, 'no': 0})
    encoded_test_classifications = test_classifications.map({'yes': 1, 'no': 0})

    # Build and fit the classification tree.
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=t)
    clf.fit(encoded_train_without_class, encoded_train_classifications)

    # Predict with the tree.
    predictions = clf.predict(encoded_test_without_class)

    # Tally accuracy and confusion-matrix counts.
    correct = 0
    TP, TN, FP, FN = 0, 0, 0, 0
    for classif, predic in zip(encoded_test_classifications, predictions):
        if classif == predic:
            correct += 1
        if classif == 1 and predic == 1:
            TP = TP + 1
        if classif == 0 and predic == 0:
            TN = TN + 1
        if classif == 0 and predic == 1:
            FP = FP + 1
        if classif == 1 and predic == 0:
            FN = FN + 1
    total = len(predictions)

    # returning dict according to the adapter
    return {'score': (correct / total) * 100,
            'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN}
def clean_data(data: DataFrame):
    """Select the model's feature columns and ordinal-encode form_field47.

    form_field47 takes the values 'charge' / 'lending'; with the explicit
    category order below they encode to 0 / 1. Returns a new DataFrame; the
    input is not modified.
    """
    columns = ['form_field1', 'form_field2', 'form_field3', 'form_field4',
               'form_field5', 'form_field6', 'form_field7', 'form_field8',
               'form_field9', 'form_field10', 'form_field12', 'form_field13',
               'form_field14', 'form_field16', 'form_field17', 'form_field18',
               'form_field19', 'form_field20', 'form_field21', 'form_field22',
               'form_field24', 'form_field25', 'form_field26', 'form_field27',
               'form_field28', 'form_field29', 'form_field32', 'form_field33',
               'form_field34', 'form_field36', 'form_field37', 'form_field38',
               'form_field39', 'form_field42', 'form_field43', 'form_field44',
               'form_field46', 'form_field47', 'form_field48', 'form_field49',
               'form_field50']
    categories = [array(['charge', 'lending'], dtype=object)]

    # .copy() avoids pandas' SettingWithCopy issues when assigning below.
    df = data[columns].copy()

    # Bug fix: the previous code set `enc.categories_` by hand on an unfitted
    # encoder, which skips the rest of the fitted state that transform()
    # validates on modern sklearn. Passing `categories=` and fitting yields
    # the same charge->0 / lending->1 mapping through the supported API.
    enc = OrdinalEncoder(categories=categories)
    df.form_field47 = enc.fit_transform(df.form_field47.to_frame())
    return df
def test_ordinal_encoder_handle_unknowns_nan():
    # Make sure unknown_value=np.nan properly works
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan
    )
    encoder.fit(np.array([[1], [2], [3]]))
    # 4 was never seen during fit, so it encodes to nan; 1 and 2 keep codes 0, 1.
    result = encoder.transform([[1], [2], [4]])
    assert_array_equal(result, [[0], [1], [np.nan]])
def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test):
    """Test the interaction between missing values and handle_unknown"""
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                             unknown_value=-1)
    assert_allclose(encoder.fit_transform(X), expected_X_trans)
    # Unknown categories in X_test must map to the configured unknown_value.
    assert_allclose(encoder.transform(X_test), [[-1.0]])