def choose_xgb_config():
    """Grid-search ``max_depth`` and ``n_estimators`` for the ``xgbr`` model.

    Every combination of the candidate depths and estimator counts below is
    fitted on the module-level ``X_train`` / ``Y_train`` and scored on
    ``X_test`` / ``Y_test`` via the estimator's ``score`` method.

    Returns:
        dict: ``{'acc': best_score, 'dep': best_max_depth,
        'est': best_n_estimators}`` for the highest-scoring combination.
    """
    # xgboost
    depth_candidates = [4, 6, 8, 10, 12, 16, 20, 32, 64]
    estimator_candidates = [100, 200, 300, 500, 700, 1000, 1200, 1400, 1500]

    # Start below any possible score so the best combination is always
    # recorded.  NOTE(review): presumably ``score`` is R^2, which can be
    # negative; a baseline of 0 would then leave 'dep'/'est' as None.
    _tmp_config = {'acc': float('-inf'), 'dep': None, 'est': None}

    for depth_ in depth_candidates:
        for est in estimator_candidates:
            model = xgbr(
                max_depth=depth_,
                n_estimators=est,
                eval_metric='rmse',
                n_jobs=4,
                learning_rate=0.18,
            )
            model.fit(X_train, Y_train)
            accuracy = model.score(X_test, Y_test)

            if accuracy > _tmp_config['acc']:
                _tmp_config['acc'] = accuracy
                _tmp_config['dep'] = depth_
                _tmp_config['est'] = est

            # NOTE(review): the original layout was lost; this progress line
            # is assumed to be printed for every combination - confirm.
            print("Accuracy: {}, dep: {}, est: {}".format(accuracy * 100.0, depth_, est))

    return _tmp_config
X_test.head()

#%%
# Back up the held-out data as a bare CSV (no header row, no index column),
# with the label appended as a trailing 'target' column.
test_data = X_test.copy()
test_data['target'] = y_test
test_data.to_csv("test_data.csv", header=False, index=False)

#%%
test_data.head()

#%%
# Fit the regressor, passing the test split as the evaluation set.
model = xgbr()
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

#%%
# Convert the fitted model to ONNX and write it to disk.
initial_types = [('float_input', FloatTensorType(shape=[1, 4]))]
conv_model = convert_xgboost(model, initial_types=initial_types)
assert conv_model is not None

save_model(conv_model, 'model.onnx')

#%%
def _report_model(model, X_train, y_train, X_val, y_val, start_time):
    """Print train/validation performance of *model* and the time since *start_time*."""
    show_performance(model, X_train, y_train, "Train")
    show_performance(model, X_val, y_val, "Validation")
    show_time(time.time() - start_time)


def _soft_vote(et_model, xgb_model, X, weights=(7, 10)):
    """Return the weighted average of both models' predictions on *X*."""
    return np.average(
        [et_model.predict(X), xgb_model.predict(X)], axis=0, weights=weights
    )


def main():
    """Train several regressors, report their performance, and write predictions.

    The training and test CSV paths are taken from ``argv[1]`` and ``argv[2]``.
    After preprocessing, 20% of the training data is held out for validation.
    Five models are fitted and reported in turn (linear, SGD, neural network,
    extra-trees, gradient boosting).  The extra-trees and gradient-boosting
    predictions are then blended with weights 7:10, the blend is scored on the
    validation split, and the blended test predictions (cast to ``int``) are
    written back into the test CSV as a ``predicted_ground_truth`` column.

    Side effects:
        Prints progress to stdout and overwrites the file at ``argv[2]``.
    """
    # random number initialization
    np.random.seed(123456000)

    # Load and preprocess the data (original note: PCA and standardization).
    Xtrain__full, ytrain__full, Xtest = load_data(argv[1], argv[2])
    Xtrain__full, ytrain__full, Xtest = preprocess(Xtrain__full, ytrain__full, Xtest)

    # train-set and validation-set split
    X_train, X_val, y_train, y_val = train_test_split(
        Xtrain__full, ytrain__full, test_size=0.20, random_state=None
    )

    # ---- Linear regressor -------------------------------------------------
    print(" ")
    print(" ")
    print("Linear regressor classifier")
    start_time = time.time()
    LR = regressor(0.01)
    LR.fit(X_train, y_train)
    _report_model(LR, X_train, y_train, X_val, y_val, start_time)

    # ---- Stochastic gradient descent regressor ----------------------------
    print("Stochastic gradient descent regressor classifier")
    start_time = time.time()
    SGDR = SGDRegressor(loss='huber', penalty='elasticnet', max_iter=100, eta0=0.01)
    SGDR.fit(X_train, y_train.flatten())
    _report_model(SGDR, X_train, y_train, X_val, y_val, start_time)

    # ---- Neural network ---------------------------------------------------
    print("Neural network classifier")
    start_time = time.time()

    def baseline_model(D):
        # Defining the NN based regressor
        model = Sequential()
        model.add(
            Dense(D, input_dim=D, kernel_initializer='glorot_uniform', activation='relu'))
        model.add(Dropout(0.25))
        model.add(
            Dense(D, input_dim=D, kernel_initializer='glorot_uniform', activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(1, kernel_initializer='glorot_uniform'))
        model.compile(loss='mae', optimizer='adam', metrics=['mae'])
        return model

    # The hidden-layer width equals the number of input features.
    _, D = np.shape(X_train)
    KR = baseline_model(D)
    KR.fit(X_train, y_train, epochs=100, batch_size=16, verbose=False)
    _report_model(KR, X_train, y_train, X_val, y_val, start_time)

    # ---- Extra-trees regressor --------------------------------------------
    print("Extratrees regressor classifier")
    start_time = time.time()
    ET = ExtraTreesRegressor(n_estimators=200, criterion='mae',
                             min_samples_split=2, min_samples_leaf=1)
    ET.fit(X_train, y_train.flatten())
    _report_model(ET, X_train, y_train, X_val, y_val, start_time)

    # ---- Extreme gradient boosted regressor -------------------------------
    print("Extreme gradient boosted regressor classifier")
    start_time = time.time()
    # Tree depth is the integer square root of the feature count.
    n = Xtrain__full.shape[1]
    XGBR = xgbr(n_estimators=400, max_depth=int(np.sqrt(n)))
    XGBR.fit(X_train, y_train.flatten())
    _report_model(XGBR, X_train, y_train, X_val, y_val, start_time)

    # ---- Soft voting over ET and XGBR -------------------------------------
    print("Soft voting over best performing ET and XGBR classifiers")
    mae = mean_absolute_error(y_val, _soft_vote(ET, XGBR, X_val))
    print("Validation MAE: %f" % mae)

    # ---- Write out the results --------------------------------------------
    print(" ")
    print(" ")
    print("Writing out the results")
    # astype(int) truncates the blended predictions toward zero.
    predictions = _soft_vote(ET, XGBR, Xtest).astype(int)
    df = pd.read_csv(argv[2])
    df['predicted_ground_truth'] = predictions
    df.to_csv(argv[2], index=False)
    print("Task completed")
# Tail of the model_3 network definition: dropout, flatten, two dense hidden
# layers, and a single linear output unit.
model_3.add(Dropout(0.2))
model_3.add(Flatten())
model_3.add(Dense(1024, activation='relu'))
model_3.add(Dense(512, activation='relu'))
model_3.add(Dropout(0.3))
model_3.add(Dense(1, activation='linear'))

# NOTE(review): "accuracy" is rarely meaningful for a linear-output model
# trained on MSE; it is kept so the recorded training history is unchanged.
model_3.compile(loss="mse", optimizer="adam", metrics=["accuracy", "mse"])

# Assuming model_3 is a Keras model: summary() prints the table itself and
# returns None, so wrapping it in print() added a stray "None" line.
model_3.summary()

history_3 = model_3.fit(X_train, y_train, batch_size=16, epochs=2000)

# Gradient-boosted baseline.
# NOTE(review): this fit uses Y_train / Y_test while the network above uses
# y_train - confirm both names refer to the intended targets.
model = xgbr(
    max_depth=8,
    n_estimators=118,
    eval_metric='rmse',
    n_jobs=4,
    learning_rate=0.2,
)
model.fit(X_train, Y_train)
accuracy = model.score(X_test, Y_test)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

# Plot the tree at index 1 on a large canvas.
plt.rcParams['figure.figsize'] = (30, 30)
xgboost.plot_tree(model, num_trees=1)
plt.show()
from sklearn.linear_model import LinearRegression as linear
from sklearn.model_selection import KFold, cross_val_score as cvs, train_test_split as tts
from sklearn.metrics import mean_squared_error as mse
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime

# Load the dataset and hold out 30% of it for testing.
data = load_boston()
x = data.data
y = data.target
xtrain, xtest, ytrain, ytest = tts(x, y, test_size=0.3, random_state=420)

# The keyword was misspelled as "n_setimators", so the requested number of
# estimators was never applied.
reg = xgbr(n_estimators=100).fit(xtrain, ytrain)

# Predictions and score
reg.predict(xtest)
reg.score(xtest, ytest)

# Mean squared error, and the target mean for scale
mse(ytest, reg.predict(xtest))
y.mean()

# Feature importances
reg.feature_importances_

# 3. Use cross-validation to compare xgbr, random forest and linear regression
# (keyword was misspelled as "n_eatimators").
reg = xgbr(n_estimators=100)
cvs(reg, xtrain, ytrain, cv=5).mean()

# Cross-validation scored by negative mean squared error
cvs(reg, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# NOTE(review): this rebinds the name `rfr` from the estimator class to an
# instance; kept as-is because later code may rely on `rfr` being the instance.
rfr = rfr(n_estimators=100)
# Disabled grid search, kept verbatim for reference:
# grid_search.fit(train_data, price)
# evalute_result = grid_search.cv_results_
# print('每轮的迭代结果:{}'.format(evalute_result))
# print('best_params:', grid_search.best_params_)
# print('best_score:', grid_search.best_score_)
# print('GridSearchCV process use %.2f seconds'%(time.time()-start))

# Model definitions
print('xgb模型训练中...')
# Hyperparameters for the xgb regressor; trailing notes are the original
# author's chosen value and the library default they recorded.
xgb_params = {
    'learning_rate': 0.1,     # 0.1, default=0.3
    'n_estimators': 1000,     # 1000
    'max_depth': 10,          # 10, default=6
    'min_child_weight': 2,    # 2, default=1
    'subsample': 0.8,         # 0.8, default=1
    'colsample_bytree': 0.9,  # 0.9, default=1
    'gamma': 0.7,             # 0.7, default=0
    'reg_alpha': 0,           # 0, default=0
    'reg_lambda': 0.1,        # 0.1, default=1
    'n_jobs': 8,
}
model_xgb = xgbr(**xgb_params)
model_xgb.fit(train_data, price)
y_pred_xgb = model_xgb.predict(test_data)
# y_pred_xgb = np.round(np.exp(y_pred_xgb), 0)

print('lgb模型训练中...')
lgb_params = {'n_estimators': 500, 'learning_rate': 0.1, 'n_jobs': 8}
model_lgb = lgb.LGBMRegressor(**lgb_params)
model_lgb.fit(train_data, price)
y_pred_lgb = model_lgb.predict(test_data)
# y_pred_lgb = np.round(np.exp(y_pred_lgb), 0)

print('gbdt模型训练中...')