# Imports assumed by this snippet (the section opens mid-call, so the
# constructor head below is reconstructed from the variable used later).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

xgb = XGBRegressor(min_child_weight=6,
                   n_estimators=1000,
                   max_depth=7,
                   colsample_bytree=0.6)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
r2 = xgb.score(X_test, y_test)  # score() returns R² for a regressor, not accuracy
'R²: ' + str(np.round(r2 * 100, 2)) + '%'
mean_absolute_error(y_test, xgb_pred)           # MAE
mean_squared_error(y_test, xgb_pred)            # MSE
np.sqrt(mean_squared_error(y_test, xgb_pred))   # RMSE

lgb = LGBMRegressor(objective='regression')
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
r2 = lgb.score(X_test, y_test)  # R² again
'R²: ' + str(np.round(r2 * 100, 2)) + '%'
mean_absolute_error(y_test, lgb_pred)           # MAE
mean_squared_error(y_test, lgb_pred)            # MSE
np.sqrt(mean_squared_error(y_test, lgb_pred))   # RMSE
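
# Optional: collect both models' test metrics into one table for a
# side-by-side comparison (a sketch using the predictions computed above).
metrics = pd.DataFrame({
    'XGBoost': [mean_absolute_error(y_test, xgb_pred),
                mean_squared_error(y_test, xgb_pred),
                np.sqrt(mean_squared_error(y_test, xgb_pred))],
    'LightGBM': [mean_absolute_error(y_test, lgb_pred),
                 mean_squared_error(y_test, lgb_pred),
                 np.sqrt(mean_squared_error(y_test, lgb_pred))],
}, index=['MAE', 'MSE', 'RMSE'])
print(metrics)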

from yellowbrick.regressor import ResidualsPlot
visualizer = ResidualsPlot(xgb)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()  # render the residuals plot
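
# A companion diagnostic from the same library: yellowbrick's PredictionError
# visualizer plots predicted against actual values with an identity line
# (a sketch following the same fit/score/show flow as above).
from yellowbrick.regressor import PredictionError
pe_visualizer = PredictionError(xgb)
pe_visualizer.fit(X_train, y_train)
pe_visualizer.score(X_test, y_test)
pe_visualizer.show()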

# Compare Ridge/Lasso predictions with actual values (rid, la, and la_pred
# are assumed to come from earlier fits not shown in this section).
rid_pred_t = rid.predict(X_train)
la_pred_t = la.predict(X_train)
plt.scatter(la_pred_t, y_train, c="blue", marker="s", label="Training data")
plt.scatter(la_pred, y_test, c="lightgreen", marker="s", label="Test data")
plt.legend(loc="upper left")
plt.show()

def do():
    train_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blCodingFormatingESI_2017_5_train_input.csv'
    )
    test_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blCodingFormatingESI_2017_5_test_input.csv'
    )

    # test_data = pd.read_csv('D:/testFiles/for_excute_folder/activity_blCodingFormatingESI_2017_5_train_input.csv')
    # train_data = pd.read_csv('D:/testFiles/for_excute_folder/activity_blCodingFormatingESI_2017_5_test_input.csv')

    # drop_col_names = ['Global-SystemAdmin']

    # Drop the first column (likely an index column saved with the CSV)
    train_data = train_data.drop(train_data.columns[0], axis=1)
    test_data = test_data.drop(test_data.columns[0], axis=1)

    # Trim extreme outliers on the target
    train_data = train_data[train_data["TIME_USED"] <= 1000]
    test_data = test_data[test_data["TIME_USED"] <= 1000]

    # train_data = train_data[train_data["ASSIGN_COUNT"] <= 1]
    # test_data = test_data[test_data["ASSIGN_COUNT"] <= 1]

    # train_data = train_data.drop(drop_col_names, axis=1)
    # test_data = test_data.drop(drop_col_names, axis=1)

    train_data['TIME_USED'] = train_data['TIME_USED'] / 60
    test_data['TIME_USED'] = test_data['TIME_USED'] / 60

    # Engineered feature: squared TIME_USERD_MEDIAN
    train_data['TIME_USERD_MEDIAN_S2'] = train_data['TIME_USERD_MEDIAN']**2
    test_data['TIME_USERD_MEDIAN_S2'] = test_data['TIME_USERD_MEDIAN']**2

    # Engineered feature: TIME_USERD_MEDIAN × bkgOffice_median_by_task_type
    # interaction
    train_data['TIME_USERD_MEDIAN_S3'] = train_data[
        'TIME_USERD_MEDIAN'] * train_data['bkgOffice_median_by_task_type']
    test_data['TIME_USERD_MEDIAN_S3'] = test_data[
        'TIME_USERD_MEDIAN'] * test_data['bkgOffice_median_by_task_type']

    # train_data = train_data[
    #     ['TIME_USED', 'TIME_USERD_MEDIAN', 'Freight_Type=Semi Auto', 'Freight_Type=No rate', 'TIME_USERD_COUNT', 'TIME_USERD_VAR', 'AWAY_COUNT',
    #      'AWAY_MEAN', 'TIME_USED_BY_REGION' ,'COUNT_BY_REGION', 'TIME_USED_VAR_BY_REGION']]
    # test_data = test_data[
    #     ['TIME_USED', 'TIME_USERD_MEDIAN', 'Freight_Type=Semi Auto', 'Freight_Type=No rate', 'TIME_USERD_COUNT', 'TIME_USERD_VAR', 'AWAY_COUNT',
    #      'AWAY_MEAN', 'TIME_USED_BY_REGION' ,'COUNT_BY_REGION', 'TIME_USED_VAR_BY_REGION']]

    print(test_data.head())

    # print(train_data.describe())

    # Split into target (TIME_USED) and feature matrix
    y_train = train_data['TIME_USED'].values.tolist()
    X_train = train_data.drop(['TIME_USED'], axis=1).values.tolist()

    y_test = test_data['TIME_USED'].values.tolist()
    X_test = test_data.drop(['TIME_USED'], axis=1).values.tolist()

    # Pick a model

    # regressor = SGDRegressor(l1_ratio=0.1)
    # regressor = Ridge()
    # regressor = Lasso()
    # regressor = SVR()
    # regressor = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    # regressor = AdaBoostRegressor()
    # regressor = GradientBoostingRegressor(n_estimators=400)
    # regressor = BaggingRegressor()
    # regressor = XGBRegressor(n_estimators=400, learning_rate=0.02, colsample_bytree=0.1, seed=2017)
    regressor = LGBMRegressor(n_estimators=400,
                              learning_rate=0.02,
                              seed=2017,
                              colsample_bytree=1)

    # Cross-validate on the training set
    # scores = cross_val_score(regressor, X_train, y_train, cv=4, scoring='neg_mean_absolute_error', n_jobs=-1)
    #
    # print('Cross-validation scores (neg MAE):', scores)
    # print('Mean cross-validation score:', np.mean(scores))

    # Fit the model on the training set
    regressor.fit(X_train, y_train)
    # Score on the test set; score() also returns R²
    print('Test-set R²:', regressor.score(X_test, y_test))

    # Compare predictions with the actual values
    y_predict = regressor.predict(X_test)
    df = pd.DataFrame()
    df['predict'] = y_predict
    df['real'] = y_test
    df['diff'] = y_predict - y_test
    print(df.head(20))

    print('MAE =  ', mean_absolute_error(y_test, y_predict))
    print('MSE =  ', mean_squared_error(y_test, y_predict))
    print('R2 =  ', r2_score(y_test, y_predict))
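    # RMSE on the same predictions (just the square root of the MSE printed
    # above, matching the earlier notebook cells).
    print('RMSE = ', np.sqrt(mean_squared_error(y_test, y_predict)))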

    print('feature_importances\n')
    # Only tree-based models expose this attribute
    print(regressor.feature_importances_)
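
    # A sketch for reading the importances: pair each value with its column
    # name (assumes the feature order matches the dataframe columns before
    # the .values.tolist() conversion above).
    feature_names = train_data.drop(['TIME_USED'], axis=1).columns
    for name, importance in sorted(zip(feature_names, regressor.feature_importances_),
                                   key=lambda pair: -pair[1]):
        print(name, importance)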