def test_one_hot_encoder_sparse():
    """Test OneHotEncoder's fit and transform."""
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder()
    # discover max values automatically
    X_trans = enc.fit_transform(X).toarray()
    assert_equal(X_trans.shape, (2, 5))
    assert_array_equal(enc.active_features_,
                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       [[0., 1., 0., 1., 1.],
                        [1., 0., 1., 0., 1.]])

    # max value given as 3
    enc = OneHotEncoder(n_values=4)
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 4 * 3))
    assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])

    # max value given per feature
    enc = OneHotEncoder(n_values=[3, 2, 2])
    X = [[1, 0, 1], [0, 1, 1]]
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 3 + 2 + 2))
    assert_array_equal(enc.n_values_, [3, 2, 2])

    # check that transforming with a larger feature value works:
    X = np.array([[2, 0, 1], [0, 1, 1]])
    enc.transform(X)

    # test that an error is raised when out of bounds:
    X_too_large = [[0, 2, 1], [0, 1, 1]]
    assert_raises(ValueError, enc.transform, X_too_large)
    assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)

    # test that an error is raised for the wrong number of features
    assert_raises(ValueError, enc.transform, X[:, :-1])
    # test that an error is raised for the wrong number of features in fit
    # with prespecified n_values
    assert_raises(ValueError, enc.fit, X[:, :-1])
    # test exception on wrong init param
    assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)

    enc = OneHotEncoder()
    # test negative input to fit
    assert_raises(ValueError, enc.fit, [[0], [-1]])

    # test negative input to transform
    enc.fit([[0], [1]])
    assert_raises(ValueError, enc.transform, [[0], [-1]])
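# Note (added): n_values, active_features_, and feature_indices_ belong to the
# legacy OneHotEncoder API; they were deprecated in scikit-learn 0.20 and
# removed in 0.22. A minimal sketch of the same input under the modern API,
# where only the observed categories of each column are kept:
#
#     import numpy as np
#     from sklearn.preprocessing import OneHotEncoder
#
#     X = [[3, 2, 1], [0, 1, 1]]
#     enc = OneHotEncoder()            # categories='auto' is the default
#     X_trans = enc.fit_transform(X).toarray()
#     print(enc.categories_)           # [array([0, 3]), array([1, 2]), array([1])]
#     print(X_trans.shape)             # (2, 5), matching the legacy test above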
def xgboost_lr_model(train, label):
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        train, label, test_size=0.3, random_state=42)

    # define the XGBoost model
    xgboost = xgb.XGBClassifier(nthread=4, learning_rate=0.08, n_estimators=50,
                                max_depth=5, gamma=0, subsample=0.9,
                                colsample_bytree=0.5)
    # train the XGBoost model
    xgboost.fit(X_train, y_train)

    # predict with XGBoost and evaluate AUC
    y_pred_test = xgboost.predict_proba(X_test)[:, 1]
    xgb_test_auc = roc_auc_score(y_test, y_pred_test)
    print('xgboost test auc: %.5f' % xgb_test_auc)

    # encode the original features as XGBoost leaf indices
    X_train_leaves = xgboost.apply(X_train)
    X_test_leaves = xgboost.apply(X_test)

    # concatenate the encoded training and test data
    All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
    All_leaves = All_leaves.astype(np.int32)

    # one-hot encode all leaf features
    xgbenc = OneHotEncoder()
    X_trans = xgbenc.fit_transform(All_leaves)

    (train_rows, cols) = X_train_leaves.shape

    # define the LR model
    lr = LogisticRegression()
    # train LR on the XGBoost-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    xgb_lr_auc1 = roc_auc_score(y_test, y_pred_xgblr1)
    print('LR AUC on XGBoost-encoded features: %.5f' % xgb_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # predict and evaluate AUC
    y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
    xgb_lr_auc2 = roc_auc_score(y_test, y_pred_xgblr2)
    print('LR AUC on combined features: %.5f' % xgb_lr_auc2)
def gbdt_lr_train(libsvmFileName):
    # load the samples
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    # train the model
    gbdt.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print(X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
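# The gbdt_lr_train snippets in this file assume imports along these lines
# (a sketch; the module aliases match the calls above):
import numpy as np
from scipy.sparse import hstack
from sklearn.datasets import load_svmlight_file
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder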
def Predict(self, X_test):
    X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
    gbdtenc = OneHotEncoder()
    self.X_trans = gbdtenc.fit_transform(
        np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
    X_test_ext = hstack([self.X_trans[self.train_rows:, :], X_test])
    y_pred_gbdtlr2 = self.lr.predict_proba(X_test_ext)[:, 1]
    return list(y_pred_gbdtlr2)
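# Caveat (added): Predict above refits the OneHotEncoder on the concatenated
# train+test leaves at prediction time, so the one-hot column layout depends
# on the test batch and may not match what self.lr was trained on. A common
# alternative (a sketch, assuming the same self.gbdt / self.lr attributes and
# an encoder self.enc fitted once at training time with
# OneHotEncoder(handle_unknown='ignore') on self.X_train_leaves):
def predict(self, X_test):
    X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
    # reuse the training-time encoder; unseen leaves map to all-zero columns
    X_test_trans = self.enc.transform(X_test_leaves)
    X_test_ext = hstack([X_test_trans, X_test])
    return list(self.lr.predict_proba(X_test_ext)[:, 1])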
def test_one_hot_encoder_dense():
    # check for sparse=False
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder(sparse=False)
    # discover max values automatically
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 5))
    assert_array_equal(enc.active_features_,
                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       np.array([[0., 1., 0., 1., 1.],
                                 [1., 0., 1., 0., 1.]]))
def construct_features(self):
    '''
    Construct features.
    '''
    # Parse date features.
    print("Parsing date features")
    parsed_train_X = self.parse_date_feature(self.train_x[:, 0])
    parsed_test_X = self.parse_date_feature(self.test_x[:, 0])

    # Parse other features.
    print("Parsing all features")
    total_train = len(self.train_x)
    total_test = len(self.test_x)
    for index_feature in range(1, len(self.train_x[0])):
        print("Processing feature ", index_feature)
        # Check if we have a categorical feature.
        labels = np.unique(self.train_x[:, index_feature])
        # If we have string or binary labels, we have a categorical feature.
        if isinstance(self.train_x[0, index_feature], str) or len(labels) == 2:
            # We have a categorical feature.
            # Encode it in the one-hot format.
            original_data = np.hstack((self.train_x[:, index_feature],
                                       self.test_x[:, index_feature]))
            label_encoder = LabelEncoder()
            data_label_encoded = label_encoder.fit_transform(original_data)
            encoder = OneHotEncoder()
            data_encoded = encoder.fit_transform(
                data_label_encoded.reshape((len(data_label_encoded), 1)))
            data_encoded = np.asarray(data_encoded.todense()).astype(bool)
            # Add the encoded feature to the data.
            parsed_train_X = np.hstack((parsed_train_X,
                                        data_encoded[0:total_train, :]))
            parsed_test_X = np.hstack((parsed_test_X,
                                       data_encoded[total_train:, :]))
            del data_encoded
        else:
            # We have a numeric feature.
            # Just add it to the data.
            parsed_train_X = np.hstack((parsed_train_X,
                                        self.train_x[:, index_feature].reshape((total_train, 1))))
            parsed_test_X = np.hstack((parsed_test_X,
                                       self.test_x[:, index_feature].reshape((total_test, 1))))
    self.train_x = parsed_train_X
    self.test_x = parsed_test_X
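# Alternative (added): modern OneHotEncoder (scikit-learn >= 0.20) accepts
# string columns directly, so the LabelEncoder step and the train/test
# stacking above can be avoided. A sketch for a single string-valued numpy
# column; handle_unknown='ignore' maps unseen test categories to all-zero rows:
from sklearn.preprocessing import OneHotEncoder

def encode_column(train_col, test_col):
    enc = OneHotEncoder(handle_unknown='ignore')
    train_enc = enc.fit_transform(train_col.reshape(-1, 1)).toarray()
    test_enc = enc.transform(test_col.reshape(-1, 1)).toarray()
    return train_enc, test_enc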
def xgboost_lr_train():
    path = ['data_11_02.csv', 'data_11_03.csv']
    data, label = import_data([], [], path)
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=0.1, random_state=42)
    xgboost = xgb.XGBClassifier(nthread=4, learning_rate=0.02, min_child_weight=5,
                                tree_method='gpu_hist', n_estimators=500,
                                max_depth=5)
    global model_xgb, model_tmp, model_lr
    model_xgb = xgboost.fit(X_train, y_train)
    y_pred_test = xgboost.predict_proba(X_test)[:, 1]
    xgb_test_auc = roc_auc_score(y_test, y_pred_test)
    print('xgboost test auc: %.5f' % xgb_test_auc)

    X_train_leaves = xgboost.apply(X_train)
    X_test_leaves = xgboost.apply(X_test)
    All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
    All_leaves = All_leaves.astype(np.int32)

    xgbenc = OneHotEncoder()
    X_trans = xgbenc.fit_transform(All_leaves)
    (train_rows, cols) = X_train_leaves.shape

    lr = LogisticRegression()
    model_tmp = lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    xgb_lr_auc1 = roc_auc_score(y_test, y_pred_xgblr1)
    print('LR based on XGB AUC: %.5f' % xgb_lr_auc1)

    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    model_lr = lr.fit(X_train_ext, y_train)
    joblib.dump(model_lr, 'lr')
    y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
    xgb_lr_auc2 = roc_auc_score(y_test, y_pred_xgblr2)
    print('LR AUC based on combined features: %.5f' % xgb_lr_auc2)
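# Note (added): the model persisted with joblib.dump above can be reloaded
# later (a sketch; the 'lr' file name matches the dump call):
# import joblib
# model_lr = joblib.load('lr')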
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    # train/test split (the data passed in is used directly; the original
    # also loaded "sample_libsvm_data.txt" into unused variables)
    X_train, X_test, y_train, y_test = train_test_split(
        Train_libsvm, Train_tab, test_size=0.1, random_state=42)

    # train the GBDT model
    self.gbdt.fit(X_train, y_train)

    # encode the original features as GBDT leaf indices
    self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf features
    (self.train_rows, cols) = self.X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
    X_train_ext = hstack([X_trans[:self.train_rows, :], X_train])

    # train LR on the combined features
    self.lr.fit(X_train_ext, y_train)
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    # load the samples
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.1, random_state=42)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    # train the model
    gbdt.fit(X_train, y_train)

    # encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # persist the model and evaluate
    filename = 'finalized_model.sav'
    pickle.dump(lr, open(filename, 'wb'))
    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
def gbdt_lr_train():
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_gbdt_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train = X[train_index]
        X_valid = X[valid_index]
        y_train = y[train_index]
        y_valid = y[valid_index]

        # define the GBDT model
        gbdt = GradientBoostingClassifier(n_estimators=60, max_depth=3,
                                          verbose=0, max_features=0.5)
        # train the model
        gbdt.fit(X_train, y_train)
        y_pred_gbdt = gbdt.predict_proba(X_valid)[:, 1]
        gbdt_auc = roc_auc_score(y_valid, y_pred_gbdt)
        print('gbdt auc on raw features: %.5f' % gbdt_auc)
        cv_gbdt_scores.append(gbdt_auc)

        # train LR on the raw features
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        # predict and evaluate AUC
        y_pred_test = lr.predict_proba(X_valid)[:, 1]
        lr_valid_auc = roc_auc_score(y_valid, y_pred_test)
        print('LR AUC on raw features: %.5f' % lr_valid_auc)
        cv_lr_scores.append(lr_valid_auc)

        # encode the original features as GBDT leaf indices
        X_train_leaves = gbdt.apply(X_train)[:, :, 0]
        X_valid_leaves = gbdt.apply(X_valid)[:, :, 0]

        # one-hot encode all leaf features
        (train_rows, cols) = X_train_leaves.shape
        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(
            np.concatenate((X_train_leaves, X_valid_leaves), axis=0))

        # define the LR model
        lr = LogisticRegression()
        # train LR on the GBDT-encoded samples
        lr.fit(X_trans[:train_rows, :], y_train)
        # predict and evaluate AUC
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_valid, y_pred_gbdtlr1)
        print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)
        cv_lr_trans_scores.append(gbdt_lr_auc1)

        # define the LR model
        lr = LogisticRegression(n_jobs=-1)
        # combine features
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_valid_ext = hstack([X_trans[train_rows:, :], X_valid])
        print(X_train_ext.shape)
        # train LR on the combined features
        lr.fit(X_train_ext, y_train)
        # predict and evaluate AUC
        y_pred_gbdtlr2 = lr.predict_proba(X_valid_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_valid, y_pred_gbdtlr2)
        print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
        cv_lr_trans_raw_scores.append(gbdt_lr_auc2)

    cv_lr = np.mean(cv_lr_scores)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_gbdt = np.mean(cv_gbdt_scores)
    print("==" * 20)
    print("gbdt on raw features, cv_gbdt:", cv_gbdt)
    print("lr on raw features, cv_lr:", cv_lr)
    print("lr on gbdt-encoded features, cv_lr_trans:", cv_lr_trans)
    print("lr on gbdt-encoded plus raw features, cv_lr_trans_raw:", cv_lr_trans_raw)
def xgb_lr_train():
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_xgb_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train = X[train_index]
        X_valid = X[valid_index]
        y_train = y[train_index]
        y_valid = y[valid_index]

        # define the XGBoost model
        xgboost = xgb.XGBClassifier(nthread=4, learning_rate=0.08,
                                    n_estimators=100, max_depth=4, gamma=0,
                                    subsample=0.7, colsample_bytree=0.7,
                                    verbosity=1)
        # train the model
        xgboost.fit(X_train, y_train)
        y_pred_valid = xgboost.predict_proba(X_valid)[:, 1]
        xgb_valid_auc = roc_auc_score(y_valid, y_pred_valid)
        print('xgb auc on raw features: %.5f' % xgb_valid_auc)
        cv_xgb_scores.append(xgb_valid_auc)

        # encode the original features as XGBoost leaf indices
        X_train_leaves = xgboost.apply(X_train)
        X_valid_leaves = xgboost.apply(X_valid)

        # concatenate the encoded training and validation data
        All_leaves = np.concatenate((X_train_leaves, X_valid_leaves), axis=0)
        All_leaves = All_leaves.astype(np.int32)

        # one-hot encode all leaf features
        xgbenc = OneHotEncoder()
        X_trans = xgbenc.fit_transform(All_leaves)
        (train_rows, cols) = X_train_leaves.shape

        # define the LR model
        lr = LogisticRegression()
        # train LR on the XGBoost-encoded samples
        lr.fit(X_trans[:train_rows, :], y_train)
        # predict and evaluate AUC
        y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        xgb_lr_auc1 = roc_auc_score(y_valid, y_pred_xgblr1)
        print('LR AUC on XGBoost-encoded features: %.5f' % xgb_lr_auc1)
        cv_lr_trans_scores.append(xgb_lr_auc1)

        # define the LR model
        lr = LogisticRegression(n_jobs=-1)
        # combine features
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_test_ext = hstack([X_trans[train_rows:, :], X_valid])
        # train LR on the combined features
        lr.fit(X_train_ext, y_train)
        # predict and evaluate AUC
        y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
        xgb_lr_auc2 = roc_auc_score(y_valid, y_pred_xgblr2)
        print('LR AUC on combined features: %.5f' % xgb_lr_auc2)
        cv_lr_trans_raw_scores.append(xgb_lr_auc2)

    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_xgb = np.mean(cv_xgb_scores)
    print("==" * 20)
    print("xgb on raw features, cv_xgb:", cv_xgb)
    print("lr on xgb-encoded features, cv_lr_trans:", cv_lr_trans)
    print("lr on xgb-encoded plus raw features, cv_lr_trans_raw:", cv_lr_trans_raw)
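# Both CV routines above index module-level X and y with row indices from
# StratifiedKFold; a sketch of how they might be prepared (the file name is
# an assumption):
# from sklearn.datasets import load_svmlight_file
# X, y = load_svmlight_file('sample_libsvm_data.txt')
# X = X.toarray()  # dense rows make X[train_index] straightforward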
def xgboost_lr_train(libsvmTrain, libsvmTest):
    # load train/test data
    X_train, y_train = load_svmlight_file(libsvmTrain, n_features=1491, offset=1)
    X_test, y_test = load_svmlight_file(libsvmTest, n_features=1491, offset=1)
    print("train top5 example:\n")
    print_example4cpp(X_train, 5, 0)
    print("test top5 example:\n")
    print_example4cpp(X_test, 5, 0)

    # define the XGBoost model
    # nthread=4, gamma=0, subsample=0.9, colsample_bytree=0.5,
    xgboost = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=100,
        max_depth=3,
        max_leaf_nodes=10,
        # eval_metric="auc",
        missing=-999
    )
    # xgboost = xgb.XGBRegressor(nthread=4, learning_rate=1,
    #                            n_estimators=2, max_depth=3, gamma=0,
    #                            subsample=0.9, colsample_bytree=0.5)

    # train the XGBoost model
    xgboost.fit(X_train, y_train)
    xgboost.save_model("../data/test1.model")
    cp_model = xgb.Booster(model_file='../data/test1.model')
    cp_model.dump_model("../data/test1.raw.txt")

    # predict with XGBoost and evaluate AUC
    y_pred_test1 = xgboost.predict_proba(X_test)[:, 1]  # for classifier
    # y_pred_test2 = xgboost.predict(X_test)
    xgb_test_auc = roc_auc_score(y_test, y_pred_test1)
    print('xgboost test auc: %.5f' % xgb_test_auc)
    print("test top5 pred1: {}".format(list(zip(y_test[:5], y_pred_test1[:5]))))
    # print("test top5 pred2: {}".format(list(zip(y_test[:5], y_pred_test2[:5]))))

    # encode the original features as XGBoost leaf indices
    X_train_leaves = xgboost.apply(X_train)
    X_test_leaves = xgboost.apply(X_test)
    # concatenate the encoded training and test data
    All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
    All_leaves = All_leaves.astype(np.int32)
    print("X_train leaves index: \n{}".format(X_train_leaves[:5]))

    # one-hot encode all leaf features
    xgbenc = OneHotEncoder()
    X_trans = xgbenc.fit_transform(All_leaves)
    print("X_train top5 onehot encode:")
    for i in range(5):
        print(X_trans[i].toarray())
    (train_rows, cols) = X_train_leaves.shape
    print("\nnew x_train shape for lr: {}".format(X_train_leaves.shape))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the XGBoost-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    xgb_lr_auc1 = roc_auc_score(y_test, y_pred_xgblr1)
    print('LR AUC on XGBoost-encoded features: %.5f' % xgb_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1, max_iter=100)
    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # predict and evaluate AUC
    y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
    xgb_lr_auc2 = roc_auc_score(y_test, y_pred_xgblr2)
    print('LR AUC on combined features: %.5f' % xgb_lr_auc2)
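# Note (added): scoring with the reloaded raw Booster (cp_model above) goes
# through a DMatrix; for a binary:logistic model, predict returns
# probabilities (a sketch):
# y_raw = cp_model.predict(xgb.DMatrix(X_test))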
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name, isOnline):
    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3, verbose=0,
                                      max_features=0.3)
    # n_estimators=20, max_depth=3, verbose=0, max_features=0.5
    # train the model
    gbdt.fit(train[gbdt_features], train[target])
    # predict and evaluate log loss
    if not isOnline:
        y_pred_gbdt = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_test_log_loss = log_loss(test[target], y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)
    else:
        y_pred_gbdt = gbdt.predict_proba(train[gbdt_features].tail(57562))[:, 1]
        gbdt_test_log_loss = log_loss(train[target].tail(57562), y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)

    # encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(train[gbdt_features])[:, :, 0]
    X_test_leaves = gbdt.apply(test[gbdt_features])[:, :, 0]

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], train[target])
    # predict and evaluate log loss
    if not isOnline:
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], y_pred_gbdtlr1)
        print('LR log_loss on GBDT-encoded features: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')

    # define the LR model
    lr = LogisticRegression()
    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], train[lr_features]])
    X_test_ext = hstack([X_trans[train_rows:, :], test[lr_features]])
    print("gbdt output", X_trans[:train_rows, :].shape)
    print("input", train[lr_features].shape)
    print(X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, train[target])
    # predict and evaluate log loss
    if not isOnline:
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], y_pred_gbdtlr2)
        print('LR log_loss on combined features: %.5f' % gbdt_lr_test_log_loss2)
    else:
        print('Online')
        test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
        print(test['predicted_score'].head(5))
        print(len(test))
        # save the online submission result
        test[['instance_id', 'predicted_score']].to_csv(
            '../baseline_' + name + '.csv', index=False, sep=' ')
        print('Saved result success!')
def gbdt_lr_train(libsvmFileName):
    # load the samples
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)
    print("train data shape: ", X_train.shape)

    # train the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)
    # count the distinct leaves per tree
    for i in range(0, len(X_train_leaves[0])):
        cateMap = {}
        for j in range(0, len(X_train_leaves)):
            cateMap[X_train_leaves[j][i]] = 0
        print("F%d: %d" % (i, len(cateMap)))

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("gbdt leaves cross", X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
offline['FLAG'] = offline['FLAG'].astype(np.float64)  # column assignment, not attribute assignment
fpr, tpr, thresholds = metrics.roc_curve(offline['FLAG'], offline['preds'])
print('AUC:', metrics.auc(fpr, tpr))

# online prediction
print("online prediction")
online = DataFrame()
preds_online = lgb_model.predict(online_test_X,
                                 num_iteration=lgb_model.best_iteration)  # output probabilities
online['USRID'] = test_data['USRID'].astype(int)
online['RST'] = preds_online
online.to_csv("test_result.csv", index=False, sep='\t')

# offline prediction after feature construction
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
train_new_feature = lgb_model.predict(train_data.drop(['USRID', 'FLAG'], axis=1),
                                      pred_leaf=True)
test_new_feature = lgb_model.predict(test_data.drop(['USRID'], axis=1),
                                     pred_leaf=True)
train_new_feature = onehot.fit_transform(train_new_feature)
# use transform (not fit_transform) on the test leaves so the columns match
# the encoding fitted on the training leaves
test_new_feature = onehot.transform(test_new_feature)
train_new = pd.concat([train_data, DataFrame(train_new_feature.toarray())], axis=1)
test_new = pd.concat([test_data, DataFrame(test_new_feature.toarray())], axis=1)

# build the LR model
from sklearn.linear_model import LogisticRegressionCV
LR1 = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                               0.8, 0.9, 1, 10, 20, 30, 100],
                           cv=3, penalty='l2', solver='lbfgs',
                           scoring='roc_auc',  # 'auc' is not a valid scorer name
                           class_weight='balanced',
                           n_jobs=-1, random_state=42, verbose=1)
LR1.fit(train_new.drop(['USRID', 'FLAG'], axis=1), train_new.FLAG)

# offline prediction
print("offline prediction")
preds_offline = lgb_model.predict(offline_test_X,
                                  num_iteration=lgb_model.best_iteration)  # output probabilities
offline = offline_test[['USRID', 'FLAG']]
data_process(train_agg)
data_process(test_agg)
del a, gp, gp_day_mean, gp_day_var, gp1, gp2, gp3, gp4, index1, l, m1, m2, m3, merge_log, ss, ss2, t1, t2, t3, train_flg

# construct new features with GBDT
gbdt = GradientBoostingClassifier(loss='exponential', learning_rate=0.12,
                                  n_estimators=60, max_depth=3,
                                  random_state=42, max_features=None)
X_train = train_agg.drop(['USRID', 'FLAG'], axis=1)
y_train = train_agg['FLAG']
# train the model
gbdt.fit(X_train, y_train)

# encode the original features as GBDT leaf indices
X_train_leaves = gbdt.apply(X_train)[:, :, 0]
X_test_leaves = gbdt.apply(test_agg.drop('USRID', axis=1))[:, :, 0]
(train_rows, cols) = X_train_leaves.shape
onehot = OneHotEncoder()
X_trans = onehot.fit_transform(
    np.concatenate((X_train_leaves, X_test_leaves), axis=0))

# combine features
X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray())
X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray())
X_train_agg.rename(columns={494: "USRID", 495: "FLAG"}, inplace=True)
X_test_agg.rename(columns={494: "USRID"}, inplace=True)

# training and test sets
train_data = pd.merge(X_train_agg, train_log, on='USRID', how='left')
test_data = pd.merge(X_test_agg, test_log, on='USRID', how='left')
del X_train_agg, X_test_agg, train_log, test_log

# modelling
import lightgbm as lgb
train_xy, offline_test = train_test_split(train_data, test_size=0.3, random_state=42)
def _run_one_hot(X, X2, cat):
    enc = OneHotEncoder(categorical_features=cat)
    Xtr = enc.fit_transform(X)
    X2tr = enc.transform(X2)
    return Xtr, X2tr
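# Note (added): categorical_features was deprecated in scikit-learn 0.20 and
# removed in 0.22. A rough modern equivalent (a sketch) one-hot encodes the
# selected column indices with a ColumnTransformer and passes the rest through:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def _run_one_hot_modern(X, X2, cat):
    ct = ColumnTransformer([('onehot', OneHotEncoder(), cat)],
                           remainder='passthrough')
    Xtr = ct.fit_transform(X)
    X2tr = ct.transform(X2)
    return Xtr, X2tr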
data.loc[data['Apartment'] == i, 'Price'].mean()
for i in range(1, 5):
    data.loc[data['Beds'] == i, 'expensive than average bed'] = \
        data.loc[data['Beds'] == i, 'Price'] - \
        data.loc[data['Beds'] == i, 'Price'].mean()

threshold1 = Binarizer(threshold=3.0)
res1 = pd.DataFrame(threshold1.transform(data['Review'].values.reshape(-1, 1)))
threshold2 = Binarizer(threshold=80)
res2 = pd.DataFrame(threshold2.transform(data['Price'].values.reshape(-1, 1)))

pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
res3 = pd.DataFrame(
    pf.fit_transform(
        data[['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']]))

encoder = OneHotEncoder()
data_region1hot = encoder.fit_transform(data['Region'].values.reshape(-1, 1))
data_region = pd.DataFrame(data_region1hot.toarray())
data_weekday1hot = encoder.fit_transform(data['Weekday'].values.reshape(-1, 1))
data_weekday = pd.DataFrame(data_weekday1hot.toarray())

data_reformed = pd.concat(
    [data.drop(columns=['ID']), data_region, data_weekday, res1, res2, res3],
    axis=1)

Seed = 40
split = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=Seed)
for train_index, test_index in split.split(data_reformed, data_reformed['Accept']):
    train = data_reformed.loc[train_index]
    test = data_reformed.loc[test_index]
train_data = train.loc[:, train.columns != 'Accept']
fin.close()
sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(5000)    # keep the top 5000 words
tokenizer.fit_on_texts(sents)  # fit_on_texts fits in place and returns None
vocab_size = len(tokenizer.word_counts) + 1

xs = []
ys = []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts = [x[0] for x in triples]
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)

ohe = OneHotEncoder(n_values=vocab_size)
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
print('End')
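# Alternative (added): for dense one-hot vectors over a known vocab_size, an
# identity-matrix lookup avoids the encoder object entirely (a sketch using
# the xs/ys lists built above):
# eye = np.eye(vocab_size)
# X_alt = eye[np.array(xs)]
# Y_alt = eye[np.array(ys)]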
_, xgb_calibration = calibration(preds, dtest)
print('trained Xgb model auc: {:.4f}, ne: {:.4f}, logloss: {:.4f}, calibration: {:.4f}'.
      format(xgb_auc1, xgb_ne, xgb_lls, xgb_calibration))
# print("test top5 pred: {}".format([(labels[i], preds[i]) for i in range(5)]))

# one-hot encode the leaf indices
dr_leaves = bst.predict(dtrain, pred_leaf=True)
dt_leaves = bst.predict(dtest, pred_leaf=True)
all_leaves = np.concatenate((dr_leaves, dt_leaves), axis=0)
all_leaves = all_leaves.astype(np.int32)
# print("dtrain leaves index: \n{}".format(dr_leaves[:5]))

xgb_enc = OneHotEncoder()
X_trans = xgb_enc.fit_transform(all_leaves)
# print("X_train top5 onehot encode:")
# for i in range(5):
#     print(X_trans[i].toarray())

(train_rows, cols) = dr_leaves.shape
print("\nnew x_train shape for lr: {}".format(dr_leaves.shape))

# define the LR model
lr = LogisticRegression(max_iter=200)
# train LR on the XGBoost-encoded samples
lr.fit(X_trans[:train_rows, :], dtrain.get_label())
# predict and evaluate AUC
y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
xgb_lr_auc1 = roc_auc_score(dtest.get_label(), y_pred_xgblr1)
_, xgb_lr_ne = normalized_entropy(y_pred_xgblr1, dtest)
_, xgb_lr_lls = log_loss(y_pred_xgblr1, dtest)
def get_cat_one_hot_feature(tag='val'):
    if tag == 'train':
        print("fetching one-hot categorical features for online submission")
        print("training set length: " + '478032')
        print("test set length: " + '42888')
        path = config.cache_prefix_path + 'cat_one_hot_train.npz'
        if os.path.exists(path):
            cat_one_hot_train = utils.load_sparse_csr(path)
            return cat_one_hot_train
        data = pd.read_pickle(config.data_prefix_path + 'data.pkl')[config.CAT_COLS]
        labelEncoding = LabelEncoder()
        for col in data.columns:
            data[col] = labelEncoding.fit_transform(data[col].astype(str))
        onehotEncoding = OneHotEncoder()
        data = onehotEncoding.fit_transform(data)
        print(data.shape)
        utils.save_sparse_csr(path, data)
        return data
    elif tag == 'val':
        print("fetching one-hot categorical features for offline validation")
        print("training set length: " + '420627')
        print("validation set length: " + '57405')
        path = config.cache_prefix_path + 'cat_one_hot_val.npz'
        if os.path.exists(path):
            cat_one_hot_val = utils.load_sparse_csr(path)
            return cat_one_hot_val
        data = pd.read_pickle(config.data_prefix_path + 'data.pkl')[config.CAT_COLS + ['day']]
        train = data[data.day < 24]
        test = data[data.day == 24]
        del data
        gc.collect()
        train.drop(['day'], axis=1, inplace=True)
        test.drop(['day'], axis=1, inplace=True)
        data = pd.concat([train, test], axis=0)
        del train, test
        gc.collect()
        labelEncoding = LabelEncoder()
        for col in data.columns:
            data[col] = labelEncoding.fit_transform(data[col].astype(str))
        onehotEncoding = OneHotEncoder()
        data = onehotEncoding.fit_transform(data)
        print(data.shape)
        utils.save_sparse_csr(path, data)
        return data
def get_xgboost_one_hot_feature(tag='val'):
    params = {
        'booster': 'gbtree',
        'num_leaves': 35,
        'max_depth': 7,
        'eta': 0.05,
        'max_bin': 425,
        'subsample_for_bin': 50000,
        'objective': 'binary:logistic',
        'min_split_gain': 0,
        'min_child_weight': 6,
        'min_child_samples': 10,
        # 'colsample_bytree': 0.8,  # fraction of features sampled when building a tree; default 1
        # 'subsample': 0.9,         # fraction of samples used to train each tree
        'subsample_freq': 1,
        'colsample_bytree': 1,
        'reg_lambda': 4,  # L2 regularization on weights; larger values reduce overfitting
        'alpha': 4,       # L1 regularization
        'seed': 2018,
        'nthread': 7,
        'silent': True,
        'gamma': 0.2,
        'eval_metric': 'logloss'
    }
    object_features = [
        "predict_category_1", "predict_category_2", "predict_category_0",
        "predict_property_0", "predict_property_1", "predict_property_2",
        "property_1", "property_0", "property_2",
        "category_1", "category_0", "category_2",
        'category_cross_0', 'category_cross_1', 'category_cross_2',
        'hour_and_category_1', 'user_gender_id', 'user_occupation_id',
    ]
    if tag == 'train':
        print("fetching one-hot xgboost features for online submission")
        print("training set length: " + '478032')
        print("test set length: " + '42888')
        path = config.cache_prefix_path + 'xgboost_one_hot_train.npz'
        if os.path.exists(path):
            xgboost_one_hot_train = utils.load_sparse_csr(path)
            return xgboost_one_hot_train
        data = pd.read_pickle(config.data_prefix_path + 'data.pkl')
        features = [
            c for c in data.columns
            if c not in [
                'is_trade', 'instance_id', 'index', 'context_id', 'time', 'day',
                'context_timestamp', 'property_list', 'category_list',
                'property_predict_list', 'category_predict_list',
                'item_category_list', 'item_property_list',
                'predict_category_property', 'user_id', 'item_id',
                'item_brand_id', 'item_city_id', 'shop_id',
            ] and c not in object_features
        ]
        target = ['is_trade']
        train = data[data.is_trade.notnull()]
        test = data[data.is_trade.isnull()]
        del data
        gc.collect()
        xgb_train = xgb.DMatrix(train[features], label=train[target])
        xgb_test = xgb.DMatrix(test[features])
        del train, test
        gc.collect()
        model = xgb.train(params, xgb_train, 200, [(xgb_train, 'train')])
        train_leaves = model.predict(xgb_train, pred_leaf=True)
        test_leaves = model.predict(xgb_test, pred_leaf=True)
        del xgb_train, xgb_test
        gc.collect()
        onehotEncoding = OneHotEncoder()
        trans = onehotEncoding.fit_transform(
            np.concatenate((train_leaves, test_leaves), axis=0))
        utils.save_sparse_csr(path, trans)
        return trans
    elif tag == 'val':
        print("fetching one-hot xgboost features for offline validation")
        print("training set length: " + '420627')
        print("validation set length: " + '57405')
        path = config.cache_prefix_path + 'xgboost_one_hot_val.npz'
        if os.path.exists(path):
            xgboost_one_hot_val = utils.load_sparse_csr(path)
            return xgboost_one_hot_val
        data = pd.read_pickle(config.data_prefix_path + 'data.pkl')
        features = [
            c for c in data.columns
            if c not in [
                'is_trade', 'instance_id', 'index', 'context_id', 'time', 'day',
                'context_timestamp', 'property_list', 'category_list',
                'property_predict_list', 'category_predict_list',
                'item_category_list', 'item_property_list',
                'predict_category_property', 'user_id', 'item_id',
                'item_brand_id', 'item_city_id', 'shop_id',
            ] and c not in object_features
        ]
        target = ['is_trade']
        data = data[data.is_trade.notnull()]
        train = data[data.day < 24]
        val = data[data.day == 24]
        xgb_train = xgb.DMatrix(train[features], label=train[target])
        xgb_val = xgb.DMatrix(val[features], label=val[target])
        del train, val, data
        gc.collect()
        model = xgb.train(params, xgb_train, 200,
                          [(xgb_train, 'train'), (xgb_val, 'valid')])
        train_leaves = model.predict(xgb_train, pred_leaf=True)
        val_leaves = model.predict(xgb_val, pred_leaf=True)
        del xgb_train, xgb_val
        gc.collect()
        onehotEncoding = OneHotEncoder()
        trans = onehotEncoding.fit_transform(
            np.concatenate((train_leaves, val_leaves), axis=0))
        utils.save_sparse_csr(path, trans)
        return trans
def gbdt_lr_train(libsvmFileName):
    # load the samples
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # X_all_dense = X_all.todense()
    print(type(X_all))
    # print(type(X_all_dense[0]))
    # print(y_all)
    # print("===")

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)
    # print(X_train)
    # print(y_train)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    # train the model
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)
    # print(y_pred_gbdt)
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)  # gbdt auc: 0.96455

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)  # LR AUC on raw features: 0.93455

    # encode the original features as GBDT leaf indices
    # X_train_leaves = gbdt.apply(X_train)
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    # print(X_train_leaves[0:22, :])  # print 22 rows, all columns
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)
    # print(X_trans.todense()[0:22, :])

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    # print(X_trans[train_rows:, :])
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("number of combined features:", X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
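# Hypothetical entry point (a sketch; the LibSVM file name is an assumption):
# if __name__ == '__main__':
#     gbdt_lr_train('sample_libsvm_data.txt')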