Example #1
def choose_xgb_config():
    # Grid-search XGBoost depth/estimator combinations and keep the best.
    # xgbr (the XGBRegressor alias) and the train/test splits are assumed
    # to be defined in the surrounding notebook.
    best_config = {
        'acc': 0,
        'dep': None,
        'est': None
    }
    for depth_ in [4, 6, 8, 10, 12, 16, 20, 32, 64]:
        for est in [100, 200, 300, 500, 700, 1000, 1200, 1400, 1500]:
            model = xgbr(max_depth=depth_,
                         n_estimators=est,
                         eval_metric='rmse',
                         n_jobs=4,
                         learning_rate=0.18)
            model.fit(X_train, Y_train)
            accuracy = model.score(X_test, Y_test)  # R^2 for a regressor
            if accuracy > best_config['acc']:
                best_config['acc'], best_config['dep'], best_config['est'] = (accuracy, depth_, est)
            print("R^2: {:.4f}, dep: {}, est: {}".format(accuracy, depth_, est))
    return best_config
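
The same search can be written with scikit-learn's GridSearchCV, which scores each combination with cross-validation instead of a single held-out split. A minimal sketch, assuming the same X_train/Y_train and the xgbr alias for XGBRegressor:

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor as xgbr

# Same grid as the nested loops above; best_score_ is the mean CV R^2.
param_grid = {
    'max_depth': [4, 6, 8, 10, 12, 16, 20, 32, 64],
    'n_estimators': [100, 200, 300, 500, 700, 1000, 1200, 1400, 1500],
}
search = GridSearchCV(xgbr(eval_metric='rmse', n_jobs=4, learning_rate=0.18),
                      param_grid, cv=3)
search.fit(X_train, Y_train)
print(search.best_params_, search.best_score_)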
Example #2
X_test.head()


#%%
# Make a CSV backup of our test data without column names or indexes
test_data = X_test.copy()
test_data['target'] = y_test
test_data.to_csv("test_data.csv", header=False, index=False)


#%%
test_data.head()


#%%
model = xgbr()
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])


#%%
from onnxmltools import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType
from onnxmltools.utils import save_model

# Convert the fitted booster to ONNX; the signature declares one row
# of four float features as input.
conv_model = convert_xgboost(model, initial_types=[('float_input', FloatTensorType(shape=[1, 4]))])
assert conv_model is not None

save_model(conv_model, 'model.onnx')
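
To sanity-check the export, the saved file can be run through onnxruntime and compared with the in-memory model. A sketch, assuming onnxruntime is installed and X_test is a four-column DataFrame:

#%%
import numpy as np
import onnxruntime as rt

sess = rt.InferenceSession('model.onnx')
input_name = sess.get_inputs()[0].name  # 'float_input', as declared above
sample = X_test.values[:1].astype(np.float32)  # shape (1, 4)
onnx_pred = sess.run(None, {input_name: sample})[0]
print(onnx_pred, model.predict(X_test.iloc[:1]))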


Example #3

# Imports assumed by this script; the helpers load_data, preprocess,
# regressor, show_performance and show_time are project-specific and
# defined elsewhere.
import time
from sys import argv

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor as xgbr
from keras.models import Sequential
from keras.layers import Dense, Dropout


def main():

    # random number initialization
    np.random.seed(123456000)

    # preprocess data by PCA and standardization
    Xtrain__full, ytrain__full, Xtest = load_data(argv[1], argv[2])
    # Xtrain__full, ytrain__full, Xtest = load_data("train_data.csv","test_data.csv")
    Xtrain__full, ytrain__full, Xtest = preprocess(Xtrain__full, ytrain__full,
                                                   Xtest)

    # train-set and validation-set split
    X_train, X_val, y_train, y_val = train_test_split(Xtrain__full,
                                                      ytrain__full,
                                                      test_size=0.20,
                                                      random_state=None)

    # ============================================================================================================
    print(" ")
    print(" ")
    print("Linear regressor classifier")
    start_time = time.time()
    LR = regressor(0.01)
    LR.fit(X_train, y_train)

    show_performance(LR, X_train, y_train, "Train")
    show_performance(LR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Stochastic gradient descent regressor classifier")
    start_time = time.time()

    SGDR = SGDRegressor(loss='huber',
                        penalty='elasticnet',
                        max_iter=100,
                        eta0=0.01)
    SGDR.fit(X_train, y_train.flatten())

    show_performance(SGDR, X_train, y_train, "Train")
    show_performance(SGDR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Neural network classifier")
    start_time = time.time()

    def baseline_model(D):

        # Defining the NN based regressor
        model = Sequential()
        model.add(
            Dense(D,
                  input_dim=D,
                  kernel_initializer='glorot_uniform',
                  activation='relu'))
        model.add(Dropout(0.25))
        model.add(
            Dense(D,
                  input_dim=D,
                  kernel_initializer='glorot_uniform',
                  activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(1, kernel_initializer='glorot_uniform'))
        model.compile(loss='mae', optimizer='adam', metrics=['mae'])

        return model

    _, D = np.shape(X_train)
    # KR = KerasRegressor(build_fn=baseline_model(D), epochs=30, batch_size=16, verbose=False)
    KR = baseline_model(D)
    KR.fit(X_train, y_train, epochs=100, batch_size=16, verbose=False)

    show_performance(KR, X_train, y_train, "Train")
    show_performance(KR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Extratrees regressor classifier")
    start_time = time.time()

    ET = ExtraTreesRegressor(n_estimators=200,
                             criterion='mae',  # renamed to 'absolute_error' in newer scikit-learn
                             min_samples_split=2,
                             min_samples_leaf=1)
    ET.fit(X_train, y_train.flatten())

    show_performance(ET, X_train, y_train, "Train")
    show_performance(ET, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Extreme gradient boosted regressor classifier")
    start_time = time.time()

    n = Xtrain__full.shape[1]
    XGBR = xgbr(n_estimators=400, max_depth=int(np.sqrt(n)))
    XGBR.fit(X_train, y_train.flatten())

    show_performance(XGBR, X_train, y_train, "Train")
    show_performance(XGBR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Soft voting over best performing ET and XGBR classifiers")
    temp1 = ET.predict(X_val)
    temp2 = XGBR.predict(X_val)
    temp = np.average([temp1, temp2], axis=0, weights=[7, 10])
    mae = mean_absolute_error(y_val, temp)
    print("Validation MAE: %f" % mae)

    # ============================================================================================================
    print(" ")
    print(" ")
    print("Writing out the results")
    temp1 = ET.predict(Xtest)
    temp2 = XGBR.predict(Xtest)
    temp = np.average([temp1, temp2], axis=0, weights=[7, 10])
    predictions = temp.astype(int)

    df = pd.read_csv(argv[2])
    # df = pd.read_csv("test_data.csv")
    df['predicted_ground_truth'] = predictions
    df.to_csv(argv[2], index=False)
    # df.to_csv('test_data.csv', index=False)
    print("Task completed")
Example #4
# (earlier layers of model_3 are omitted in this snippet)
model_3.add(Dropout(0.2))
model_3.add(Flatten())
model_3.add(Dense(1024, activation='relu'))
model_3.add(Dense(512, activation='relu'))
model_3.add(Dropout(0.3))
model_3.add(Dense(1, activation='linear'))
model_3.compile(loss="mse", 
              optimizer="adam", 
              metrics=["accuracy", "mse"])
print(model_3.summary())

history_3 = model_3.fit(
    X_train, y_train, 
    batch_size = 16,
    epochs = 2000
)

model = xgbr(max_depth = 8,
             n_estimators = 118,
             eval_metric = 'rmse', 
             n_jobs = 4,
             learning_rate = 0.2
)
model.fit(X_train, Y_train)
accuracy = model.score(X_test, Y_test)  # R^2 for a regressor, not classification accuracy
print("R^2: %.4f" % accuracy)

plt.rcParams['figure.figsize'] = (30, 30)
xgboost.plot_tree(model, num_trees=1)
plt.show()
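
A companion view to the single plotted tree is the booster's feature-importance ranking; a minimal sketch with the same model and the usual imports:

import xgboost
import matplotlib.pyplot as plt

# Ranks features by how often the booster splits on them.
xgboost.plot_importance(model)
plt.show()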
Example #5
from sklearn.linear_model import LinearRegression as linear
from sklearn.model_selection import KFold, cross_val_score as cvs, train_test_split as tts
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import mean_squared_error as mse
from sklearn.datasets import load_boston
from xgboost import XGBRegressor as xgbr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime

data = load_boston()  # removed in scikit-learn 1.2; requires an older release
x = data.data
y = data.target

xtrain, xtest, ytrain, ytest = tts(x, y, test_size=0.3, random_state=420)
reg = xgbr(n_estimators=100).fit(xtrain, ytrain)
# predictions & score
reg.predict(xtest)
reg.score(xtest, ytest)  # R^2 on the test set
# mean squared error, set against the mean of y for scale
mse(ytest, reg.predict(xtest))
y.mean()
# feature importances
reg.feature_importances_

# 3. Compare xgbr, random forest and linear regression with cross-validation
reg = xgbr(n_estimators=100)
cvs(reg, xtrain, ytrain, cv=5).mean()  # cross-validated R^2
cvs(reg, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

rf = rfr(n_estimators=100)  # renamed so the rfr alias is not shadowed
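
The snippet breaks off here; the comparison announced in step 3 would presumably continue along these lines (a sketch reusing the aliases imported above):

# Cross-validated R^2 and negative MSE for the random forest...
cvs(rf, xtrain, ytrain, cv=5).mean()
cvs(rf, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# ...and for plain linear regression, to complete the three-way comparison.
lr = linear()
cvs(lr, xtrain, ytrain, cv=5).mean()
cvs(lr, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()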
Example #6
    # grid_search.fit(train_data, price)
    # evaluate_result = grid_search.cv_results_
    # print('Per-iteration results: {}'.format(evaluate_result))
    # print('best_params:', grid_search.best_params_)
    # print('best_score:', grid_search.best_score_)
    # print('GridSearchCV took %.2f seconds' % (time.time() - start))



    # model definition
    print('Training the xgb model...')
    model_xgb = xgbr(learning_rate=0.1, # 0.1, default=0.3
               n_estimators=1000, # 1000
               max_depth=10, # 10, default=6
               min_child_weight=2, # 2, default=1
               subsample=0.8, # 0.8, default=1
               colsample_bytree=0.9, # 0.9, default=1
               gamma=0.7, # 0.7, default=0
               reg_alpha=0, # 0, default=0
               reg_lambda=0.1, # 0.1, default=1
               n_jobs=8)
    model_xgb.fit(train_data, price)
    y_pred_xgb = model_xgb.predict(test_data)
    # y_pred_xgb = np.round(np.exp(y_pred_xgb), 0)

    print('Training the lgb model...')
    model_lgb = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.1, n_jobs=8)
    model_lgb.fit(train_data, price)
    y_pred_lgb = model_lgb.predict(test_data)
    # y_pred_lgb = np.round(np.exp(y_pred_lgb), 0)

    print('Training the gbdt model...')
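
The original code is truncated at the gbdt step. A hypothetical continuation, assuming scikit-learn's GradientBoostingRegressor and mirroring the xgb/lgb pattern above:

    # Hypothetical continuation (the source snippet ends above).
    from sklearn.ensemble import GradientBoostingRegressor
    model_gbdt = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1)
    model_gbdt.fit(train_data, price)
    y_pred_gbdt = model_gbdt.predict(test_data)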