def test_one_hot_encoder_sparse():
    """Test OneHotEncoder's fit and transform.

    Covers automatic discovery of the value range, a single ``n_values``
    for all features, per-feature ``n_values``, and the errors expected for
    out-of-range values, wrong feature counts, a bad ``n_values`` type and
    negative inputs.
    """
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder()

    # discover max values automatically
    X_trans = enc.fit_transform(X).toarray()
    assert_equal(X_trans.shape, (2, 5))
    assert_array_equal(enc.active_features_,
                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       [[0., 1., 0., 1., 1.],
                        [1., 0., 1., 0., 1.]])

    # max value given as 3 (i.e. four possible values per feature)
    enc = OneHotEncoder(n_values=4)
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 4 * 3))
    assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])

    # max value given per feature
    enc = OneHotEncoder(n_values=[3, 2, 2])
    X = [[1, 0, 1], [0, 1, 1]]
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 3 + 2 + 2))
    assert_array_equal(enc.n_values_, [3, 2, 2])
    # check that testing with larger feature works:
    X = np.array([[2, 0, 1], [0, 1, 1]])
    enc.transform(X)

    # test that an error is raised when out of bounds:
    X_too_large = [[0, 2, 1], [0, 1, 1]]
    assert_raises(ValueError, enc.transform, X_too_large)
    assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)

    # test that error is raised when wrong number of features
    assert_raises(ValueError, enc.transform, X[:, :-1])
    # test that error is raised when wrong number of features in fit
    # with prespecified n_values
    assert_raises(ValueError, enc.fit, X[:, :-1])
    # test exception on wrong init param: a type object is passed instead of
    # a value. The builtin ``int`` is used because the ``np.int`` alias (which
    # was the very same object) has been removed from recent NumPy releases.
    assert_raises(TypeError, OneHotEncoder(n_values=int).fit, X)

    enc = OneHotEncoder()
    # test negative input to fit
    assert_raises(ValueError, enc.fit, [[0], [-1]])

    # test negative input to transform
    enc.fit([[0], [1]])
    assert_raises(ValueError, enc.transform, [[0], [-1]])
def load_UCI_Credit_Card_data(infile=None, balanced=True, seed=5):
    """Load a credit-card CSV file into encoded, shuffled and scaled arrays.

    The first line of the file is skipped. For every remaining row the first
    column is dropped, the last column is read as the integer label (a label
    of 0 is mapped to -1) and the columns in between become the features.

    Args:
        infile: path of the CSV file to read. Must be provided.
        balanced: when true, the samples are passed through ``balance_X_y``
            (with ``seed``) before any further processing.
        seed: seed forwarded to ``balance_X_y`` and ``shuffle_X_y``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the one-hot encoded, standardized
        feature matrix and ``y`` holds the labels with an extra trailing axis
        (``np.expand_dims(y, axis=1)``).

    Raises:
        TypeError: if ``infile`` is None.
    """
    if infile is None:
        # Same exception type open(None) would raise, but with a usable message.
        raise TypeError("infile must be the path to the CSV file, not None")

    X = []
    y = []
    with open(infile, "r") as fi:
        fi.readline()  # skip the first line before handing the file to csv
        for row in csv.reader(fi):
            X.append(row[1:-1])
            label = int(row[-1])
            y.append(-1 if label == 0 else label)
    y = np.array(y)

    if balanced:
        X, y = balance_X_y(X, y, seed)

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

    # Feature columns 1, 2 and 3 are treated as categorical.
    # NOTE(review): presumably SEX / EDUCATION / MARRIAGE in the UCI layout --
    # confirm against the actual file header.
    encoder = OneHotEncoder(categorical_features=[1, 2, 3])
    encoder.fit(X)
    X = encoder.transform(X).toarray()

    X, y = shuffle_X_y(X, y, seed)

    scale_model = StandardScaler()
    X = scale_model.fit_transform(X)

    return X, np.expand_dims(y, axis=1)
def train(self, X, Y, class_number=-1):
    """Create and train the wrapped ELM model on ``X`` with labels ``Y``.

    A new ``ELM`` is built with one input per column of ``X`` and as many
    outputs as there are classes, then given ``X.shape[1]`` sigmoid neurons.
    The labels are one-hot encoded before training, and anything the
    underlying ``train`` call prints to stdout is discarded.

    Args:
        X: 2-D feature array (``X.shape[1]`` is used as the feature count).
        Y: label array; it is reshaped to a single column for encoding.
        class_number: lower bound for the number of output classes. The
            number of distinct labels in ``Y`` is used when it is larger.
    """
    class_count = max(np.unique(Y).size, class_number)
    feature_count = X.shape[1]
    self.__hpelm = ELM(feature_count, class_count, 'wc')
    self.__hpelm.add_neurons(feature_count, "sigm")

    # NOTE(review): the encoder only produces one column per label actually
    # present in Y, so if class_number exceeds that count the target width
    # will not match class_count -- confirm how callers use class_number.
    Y_arr = Y.reshape(-1, 1)
    Y_OHE = OneHotEncoder().fit_transform(Y_arr).toarray()

    # Silence the training output. The devnull handle is closed by the
    # ``with`` block and stdout is restored even if training raises.
    saved_stdout = sys.stdout
    with open(os.devnull, 'w') as devnull:
        sys.stdout = devnull
        try:
            self.__hpelm.train(X, Y_OHE)
        finally:
            sys.stdout = saved_stdout
def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
    '''
    Build the combined model: GBDT leaf indices (one-hot encoded) are stacked
    with the discrete features and fed to a logistic regression.

    Args:
        X_train_d: discrete-feature training data
        X_train_c: continuous-feature training data
        X_test_d: discrete-feature test data
        X_test_c: continuous-feature test data
        y_train: training labels {-1, 1}
        y_test: test labels {-1, 1}
    Returns:
        gbc_enc: OneHotEncoder fitted on the GBDT leaf indices
        gbc: the fitted GBDT model
        comb_model: the trained combined model
        threshold: decision threshold; Pred_Prob >= threshold is a positive
                   sample, Pred_Prob < threshold is a negative sample
        comb_model_auc: AUC of the combined model on the test data
        precision: precision at the chosen threshold
        recall: recall at the chosen threshold
    '''
    # random_state=None is the estimator's default, so the configured value
    # can be forwarded unconditionally.
    gbc = GradientBoostingClassifier(n_estimators=self._n_estimators,
                                     learning_rate=self._gbdt_learning_rate,
                                     max_depth=self._max_depth,
                                     random_state=self._random_state).fit(X_train_c, y_train)

    # Leaf index reached in every tree, for each sample.
    X_train_leaves = gbc.apply(X_train_c)[:, :, 0]
    X_test_leaves = gbc.apply(X_test_c)[:, :, 0]
    X_train_rows = X_train_leaves.shape[0]

    # The encoder is fitted on train and test leaves together, then the
    # encoded matrix is split back at the train/test boundary.
    all_leaves = np.concatenate([X_train_leaves, X_test_leaves], axis=0)
    gbc_enc = OneHotEncoder().fit(all_leaves)
    X_trans = gbc_enc.transform(all_leaves)

    X_train_ext = hstack([X_trans[:X_train_rows, :], X_train_d])
    X_test_ext = hstack([X_trans[X_train_rows:, :], X_test_d])
    log.debug("Combine features done.")

    comb_model = LogisticRegression().fit(X_train_ext, y_train)
    log.debug("Training done.")

    comb_model_pred = comb_model.predict_proba(X_test_ext)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)

    # Number of curve points whose recall still meets the required rate; the
    # last such point gives the selected threshold.
    recall_meet_count = int(np.count_nonzero(recall >= self._recall_rate))
    # thresholds is one element shorter than precision/recall, so cap the
    # index to avoid running past its end when every recall value qualifies.
    chosen = min(recall_meet_count, len(thresholds)) - 1
    threshold = thresholds[chosen]
    log.debug("threshold: %f - precision: %f - recall: %f",
              threshold, precision[chosen], recall[chosen])

    comb_model_auc = roc_auc_score(y_test, comb_model_pred)
    log.debug("AUC score is: %f", comb_model_auc)

    # Parenthesised so that all seven documented values are returned.
    return (gbc_enc, gbc, comb_model, threshold, comb_model_auc,
            precision[chosen], recall[chosen])
def test_one_hot_encoder_unknown_transform():
    """Check OneHotEncoder.transform for each ``handle_unknown`` setting."""
    train = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    # The values in the first two columns (4 and 1) never occur in the
    # matching training columns; the 1 in the last column does.
    query = np.array([[4, 1, 1]])

    def fitted(mode):
        # Build an encoder with the given handle_unknown mode, fitted on train.
        encoder = OneHotEncoder(handle_unknown=mode)
        encoder.fit(train)
        return encoder

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    assert_raises(ValueError, fitted('error').transform, query)

    # Test the ignore option, ignores unknown features.
    encoded = fitted('ignore').transform(query).toarray()
    assert_array_equal(encoded, np.array([[0., 0., 0., 0., 1., 0., 0.]]))

    # Raise error if handle_unknown is neither ignore or error.
    assert_raises(ValueError, fitted('42').transform, query)
def test_one_hot_encoder_unknown_transform():
    """Verify how transform treats feature values that were not seen in fit."""
    fit_data = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    unseen_row = np.array([[4, 1, 1]])
    expected_ignored = np.array([[0., 0., 0., 0., 1., 0., 0.]])

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    strict_enc = OneHotEncoder(handle_unknown='error')
    strict_enc.fit(fit_data)
    assert_raises(ValueError, strict_enc.transform, unseen_row)

    # Test the ignore option, ignores unknown features.
    lenient_enc = OneHotEncoder(handle_unknown='ignore')
    lenient_enc.fit(fit_data)
    dense_result = lenient_enc.transform(unseen_row).toarray()
    assert_array_equal(dense_result, expected_ignored)

    # Raise error if handle_unknown is neither ignore or error.
    invalid_enc = OneHotEncoder(handle_unknown='42')
    invalid_enc.fit(fit_data)
    assert_raises(ValueError, invalid_enc.transform, unseen_row)
def _run_one_hot(X, X2, cat):
    """Fit a OneHotEncoder on ``X`` and encode both ``X`` and ``X2`` with it.

    ``cat`` is forwarded as the encoder's ``categorical_features`` argument.
    Returns the pair ``(encoded X, encoded X2)``.
    """
    encoder = OneHotEncoder(categorical_features=cat)
    encoded_fit = encoder.fit_transform(X)
    return encoded_fit, encoder.transform(X2)