def XGB(self, x_train, y_train, x_test, y_test):
     x_train, y_train = shuffle(x_train, y_train)
     xgb = XGBRegressor(max_depth=4, subsample=0.9)
     xgb.fit(x_train,y_train)
     y_pred = xgb.predict(x_test).reshape(x_test.shape[0], 1)
     loss = mean_squared_error(y_pred, y_test)
     print(loss)
     return y_pred, loss
Example #2
def Stacking(real_train_tar):
    predictions_train = pd.DataFrame([np.expm1(y_lasso_predict), np.expm1(y_ridge_predict), np.expm1(y_rf_predict), np.expm1(y_xgb_predict)]).T
    sns.pairplot(predictions_train)
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights required in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    stack_model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
    # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_stack.fit(predictions_train, real_train_tar)
    xgb_stack.best_params_
    write_pkl(xgb_stack.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl')
    
    model_stacking = XGBRegressor(**xgb_stack.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_stacking.fit(predictions_train,real_train_tar)
    end=time.time()
    print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar),np.log1p( model_stacking.predict(predictions_train))))
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    y_stack_predict=model_stacking.predict(predictions_train)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,y_stack_predict)
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
Example #3
# Main advantages are as follows:
# 1. Easy to use
# 2. Computational efficiency
# 3. Model Accuracy
# 4. Feasibility — easy to tune parameters and modify objectives.

# In[86]:


model=XGBRegressor(max_depth=5)


# In[87]:


model.fit(X_train,y_train)


# In[88]:


y_pred=model.predict(X_test)


# In[89]:


print('R2 score using XGBoost = ', r2_score(y_test, y_pred), '/ 1.0')
print('MSE score using XGBoost = ', mean_squared_error(y_test, y_pred), '/ 0.0')

elastic_model_full_data = elasticnet.fit(X, y)

print(datetime.now(), 'training the L1-norm lasso shrinkage model')
lasso_model_full_data = lasso.fit(X, y)

print(datetime.now(), 'training the L2-norm ridge regression model')
ridge_model_full_data = ridge.fit(X, y)

print(datetime.now(), 'training the SVR support vector machine model')
svr_model_full_data = svr.fit(X, y)

print(datetime.now(), 'training the GradientBoosting model')
gbr_model_full_data = gbr.fit(X, y)

print(datetime.now(), 'training the xgboost second-order gradient boosting model')
xgb_model_full_data = xgboost.fit(X, y)


def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X)) + \
            (0.05 * lasso_model_full_data.predict(X)) + \
            (0.1 * ridge_model_full_data.predict(X)) + \
            (0.1 * svr_model_full_data.predict(X)) + \
            (0.1 * gbr_model_full_data.predict(X)) + \
            (0.15 * xgb_model_full_data.predict(X)) + \
            (0.3 * stack_gen_model.predict(np.array(X))))


print('RMSLE score of the blended model on the train data:')
print(rmsle(y, blend_models_predict(X)))
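# The rmsle() helper used above is not defined in this excerpt; a minimal
# sketch, assuming the usual definition of root mean squared log error
# (RMSE computed on log1p-transformed values):
import numpy as np
from sklearn.metrics import mean_squared_error

def rmsle(y_true, y_pred):
    # RMSE of log1p-transformed targets and predictions
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))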
X_test = lda.transform(X_test)
submit_X = pca.transform(submit_X)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

y_pred = linear_reg.predict(X_test)
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

# Xgboost (best accuracy)
from xgboost import XGBRegressor
cls_ = XGBRegressor()
cls_.fit(X_train, y_train)
y_pred = cls_.predict(X_test)
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

#support vector machines
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
'''parameters = [{'C':[1,10,100,1000,10000,100000], 'kernel':['linear']},
               {'C':[1,10,100,1000,10000,100000], 'kernel':['poly'],'degree':[1,2,3]}
               ]'''
parameters = [{
    'C': [1, 10, 100, 1000, 10000, 100000],
    'kernel': ['rbf', 'linear']
}]
grid_search = GridSearchCV(
Example #6
	mea = getmea(max_leaf_nodes,train_x,val_x,train_y,val_y)
	print("Max_leaf_nodes: %d ,mea: %d" %(max_leaf_nodes,mea))

'''
# clf = XGBRegressor() 17165
# XGBRegressor(n_estimators=400)  16330
'''
params = [.02,.03,.04,.05,.06,.07,.08,.09,.10]#[1:1001:50][100,200,300,400,500]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=400,learning_rate=param)
    test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("learning_rate vs CV Error " + str(params))
# This line is required to make the finished plot appear on screen
plt.show()
'''

my_model = XGBRegressor(n_estimators=400)
my_model.fit(train_X, train_y,verbose=False)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

#save model
#joblib.dump(melbourne_model,'model.pickle')

#load model
#model = joblib.load('model.pickle')
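# A runnable sketch of the save/load pattern hinted at in the comments above,
# persisting the fitted XGBRegressor with joblib; 'model.pickle' is the file
# name from those comments and my_model is the model fitted a few lines earlier.
import joblib

joblib.dump(my_model, 'model.pickle')        # save the trained model to disk
loaded_model = joblib.load('model.pickle')   # reload it later
print(mean_absolute_error(loaded_model.predict(test_X), test_y))  # same MAE as above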

print(x.shape)  #(506, 13)
print(y.shape)  #(506,)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)
print(x_test.shape)  #(2000, 71)
print(y_test.shape)  #(2000, 4)
print(type(x_test))  #<class 'numpy.ndarray'>
print(type(y_test))  #<class 'numpy.ndarray'>

model = XGBRegressor()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print("R2 :", score)  #R2 : 0.925782578365577
thresholds = np.sort(model.feature_importances_)
# sorted in ascending order, from lowest to highest importance

print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    #median
    select_x_train = selection.transform(x_train)
    print(select_x_train.shape)

    parameters = [{
        "n_estimators": [1000, 2000, 3000],
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error");
# This line is required to make the finished plot appear on screen
plt.show()
# Save the current figure to the file result.png
#plt.savefig('./xgboostparams.png')

# 91 16889
xgb = XGBRegressor(max_depth=6,n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y,xgb.predict(val_x)))

#gbdt
'''
print("GradientBoostingRegressor")
gbdt = GradientBoostingRegressor(n_estimators = 1000,max_leaf_nodes = 400)
gbdt.fit(X, y)#17083
#RandomForestRegressor 93  16938
#GradientBoostingRegressor 90 16866
#XGBRegressor 100 19939 
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y,gbdt.predict(val_x)))

# predict and save output
            x1 = joblib.load(data[1])
            X = pd.concat((x0, x1), axis=1)
            del x0, x1

        X_train = X[:num_train]
        X_test = X[num_train:]
        del X

        # X_train, X_val, y_train, y_val = train_test_split(X,
        #                                                  y,
        #                                                  test_size=0.1,
        #                                                  random_state=42)

        xgbr.fit(
            X_train,
            y_train,
            # eval_metric='rmse',
            # early_stopping_rounds=30,
            verbose=True,
            # eval_set=[(X_val, y_val)],
        )

        y_pred = xgbr.predict(X_test)

        res.append(y_pred)

    final_res = np.mean(res, axis=0)
    final_res[final_res < 1] = 1
    final_res[final_res > 3] = 3
    pd.DataFrame({"id": id_test, "relevance": final_res}).to_csv("xgbr_sub_20160423_bag.csv", index=False)
Example #10
# 1. dataset load
boston = load_boston()
x_names = boston.feature_names  # x variable names
# array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

X, y = load_boston(return_X_y=True)
X.shape  # (506, 13)

y  # ratio scale, not normalized

# 2. train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# 3. create model
xgb = XGBRegressor()
model = xgb.fit(x_train, y_train)
model  # objective='reg:squarederror'

# 4. visualize feature importance
fscore = model.get_booster().get_fscore()
fscore
'''
{'f5': 378,
 'f12': 254,
 'f0': 642,
 'f4': 135,
 'f7': 238,
 'f11': 289,
 'f8': 27,
 'f1': 60,
 'f3': 15,
Example #11
from xgboost import XGBRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score

boston = load_boston()

x = boston.data
y = boston.target

x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66)

xgb = XGBRegressor(n_estimators=10, learning_rate=0.1)

xgb.fit(x_train,
        y_train,
        verbose=True,
        eval_metric=["rmse", "logloss"],
        eval_set=[(x_train, y_train), (x_test, y_test)],
        early_stopping_rounds=20)
#rmse,mae,logloss,error,auc

y_pre = xgb.predict(x_test)

r2 = r2_score(y_test, y_pre)
score = xgb.score(x_test, y_test)
result = xgb.evals_result()
print(__file__)
print(result)
print("r2")
print(r2)
print("score")
print(score)
Example #12
    print('all features data loaded!')

    # training model
    data_info = train_rdd.map(lambda x: combine(x)).collectAsMap()
    data = np.array(list(data_info.values()))
    x, y = data[:, 1:], data[:, 0].reshape(-1, 1)
    print('training dataset ready! ')

    model = XGBRegressor(
        n_estimators=200,
        learning_rate=0.025,
        reg_alpha=0.1,
        max_depth=5,
    )
    model.fit(x, y)
    print('model fitted')

    mb_score_info = test_rdd.map(lambda x: recommend(
        x, rater_info, user_rating_info, user_avg_info, N=5, item_threshold=3
    )).map(lambda x: ((x[0], x[1]), x[2])).collectAsMap()

    # making prediction
    pred_info = test_rdd.map(lambda x: combine_pred(x))
    # res = test_rdd.map(lambda x: combine_pred(x)) \
    #     .map(lambda x: ((x[0][0], x[0][1]),
    #                     (model.predict(np.array(x[1]).reshape(1, -1))[0],
    #                      mb_score_info[(x[0][0], x[0][1])],
    #                      business_feats[x[0][1]][1]
    #                      ))) \
    #     .map(lambda x: (x[0][0], x[0][1], x[1][0] * (1 - 1 / x[1][2]) + x[1][1] * (1 / x[1][2]))).collect()
days = list(range(2, zones.loc[zones['Day'].idxmax()]['Day']))
test_days = [days[i] for i in random.sample(range(len(days)), int(len(days)*.3))]
train_days = list(set(days) - set(test_days))
query_1 = create_query(train_days)
query_2 = create_query(test_days)
train, test = zones.query(query_1), zones.query(query_2)
X_train, y_train = train.drop(drop + target, axis=1), train[target]
X_test, y_test = test.drop(drop + target, axis=1), test[target]

params = {'n_estimators': 500,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': .01,
        'loss': 'ls'}
clf = XGBRegressor(**params)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print('MSE: {}'.format(mse))

steps = 10
predictions, y_test = predictions[0::steps], y_test.values[0::steps]
size = len(predictions)
for idx, (vals, name, color) in enumerate([(predictions, 'Prediction', '#d896ff'), (y_test, 'Actual', '#ffaaa5')]):
    ax = plt.subplot(2, 1, idx+1)
    ax.set_title(name)
    plt.plot(range(size), vals, lw=.75, color=color)
    plt.yticks(np.arange(0, 90, 15))
plt.tight_layout()
plt.savefig('./zone/xgboost.png')
Example #14
 'learning_rate': 0.12,
 'gamma': 0.0,
 'max_depth': 12,
 'min_child_weight': 1,
 'max_delta_weight': 20,
 'rate_drop': 0.0}
]

xgboost_params = xgboost_params_list[pred_index]


# In[115]:


xgb_model = XGBRegressor(**xgboost_params)
xgb_model.fit(x_train, y_train)

y_pred = xgb_model.predict(x_test)
predictions = [round(value) for value in y_pred]
mse = metric.mean_squared_error(y_test, predictions)
rmse = math.sqrt(mse)
print(rmse)


# # LGBM 

# In[69]:


import lightgbm
import lightgbm as lgb
Example #15
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                                                model1.predict(train[col])))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                                          model2.predict(train[col])))
#test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2
test['visitors'] = model2.predict(test[col])
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
#del train; del data;

sub1[['id', 'visitors']].to_csv(os.path.join(path_kaggle, 'naive_forecast2.csv'),
                                index = False)

from xgboost import XGBRegressor
model3 = XGBRegressor()
model3.fit(train[col], np.log1p(train['visitors'].values), verbose=False)
print('XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                              model3.predict(train[col])))

## from hklee
## https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code
#dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
#    pd.read_csv(fn)for fn in glob.glob('../input/*.csv')}
#
#for k, v in dfs.items(): locals()[k] = v
#
#wkend_holidays = date_info.apply(
#    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
#date_info.loc[wkend_holidays, 'holiday_flg'] = 0
#date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  
#
    else:
        return 0


# and the transformation is applied on the test data for later use.
# The train data will be transformed while it is being fit.
y_test_binary = pd.DataFrame(y_test["value"].apply(getBinary))

regressorLow = XGBRegressor(gamma=0.0,
                            n_estimators=200,
                            base_score=0.5,
                            colsample_bytree=0.7,
                            learning_rate=0.2,
                            max_depth=5,
                            objective="reg:linear")
xgbModelLow = regressorLow.fit(X_train, y_train.value)
xgboost.plot_importance(xgbModelLow)
y_predicted = xgbModelLow.predict(X_test)

y_predicted_binary = [1 if yp >= 0.5 else 0 for yp in y_predicted]

print(accuracy_score(y_test_binary, y_predicted_binary))

fig = plt.figure(figsize=(8, 8))
plt.xticks(rotation='vertical')
y_pos = np.arange(len(xgbModelLow.feature_importances_))
plt.barh([i for i in range(len(xgbModelLow.feature_importances_))],
         xgbModelLow.feature_importances_.tolist(),
         align='center',
         alpha=0.4)
plt.yticks(y_pos, X_test.columns)
Example #17
dataset = pd.concat([dataset, dataset_credits], axis=1)

dataset = encode_json_column(dataset, 22, "name", 500, 1)

y = dataset.iloc[:, 18].values  #12 for revenue, 18 for rating
X = dataset.iloc[:, 23:].values
X_names = dataset.columns[23:].values

from xgboost import XGBRegressor
regressor = XGBRegressor(colsample_bytree=0.6,
                         gamma=0.7,
                         max_depth=4,
                         min_child_weight=5,
                         subsample=0.8,
                         objective='reg:squarederror')
regressor.fit(X, y)

importances = {}

count = 0
for feature_importance in regressor.feature_importances_:
    if feature_importance > 0.002:
        feature_name = X_names[count]
        importances[feature_name] = feature_importance
    count += 1

import operator
sorted_importances = sorted(importances.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
    print('Feature:%0d -> %s, Score: %.5f' % (i, feature_names[i], v))

# plot feature importance
pyplot.bar([x for x in range(len(rf_importance))], rf_importance)
pyplot.show()

print('Accuracy of Random Forest regressor on training set: {:.2f}'.format(
    rf_reg.score(X_train, y_train)))
print('Accuracy of Random Forest regressor on test set: {:.2f}'.format(
    rf_reg.score(X_test, y_test)))
"""**XGBoost Regression Feature Importance**"""

from xgboost import XGBRegressor

xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, y_train)

xgb_importance = xgb_reg.feature_importances_
for i, v in enumerate(xgb_importance):
    print('Feature:%0d -> %s, Score: %.5f' % (i, feature_names[i], v))

# plot feature importance
pyplot.bar([x for x in range(len(xgb_importance))], xgb_importance)
pyplot.show()

print('Accuracy of Xgboost regressor on training set: {:.2f}'.format(
    xgb_reg.score(X_train, y_train)))
print('Accuracy of Xgboost regressor on test set: {:.2f}'.format(
    xgb_reg.score(X_test, y_test)))
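# As an alternative to the impurity-based importances above, the permutation
# feature importance announced in the heading below can be computed with
# scikit-learn. A minimal sketch, assuming the fitted xgb_reg, X_test, y_test,
# and feature_names from this block are still in scope:
from sklearn.inspection import permutation_importance

perm = permutation_importance(xgb_reg, X_test, y_test, n_repeats=10, random_state=0)
for i, v in enumerate(perm.importances_mean):
    print('Feature:%0d -> %s, Score: %.5f' % (i, feature_names[i], v))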
"""**Permutation Feature Importance for Regression**"""
seed=3
test_size=.3

X_train, X_test, y_train, y_test = train_test_split(X, log_loss, test_size=test_size, random_state=seed)


model=XGBRegressor(learning_rate=0.08,
                   max_depth=10,
                   objective='reg:linear',
                   nthread=3,
                   gamma=0.2,
                   subsample=0.9,
                   n_estimators=100,
                   )
model.fit(X_train, y_train)
print(model)
y_pred=model.predict(X_test)

def mae(predicted, actual, logscale=False):
    if logscale == True:
        predexp=np.exp(predicted)
        actualexp=np.exp(actual)
        return np.mean(np.abs(predexp - actualexp))
    else:
        return np.mean(np.abs(predicted - actual))

print(mae(y_pred, y_test, True))


# #Plotting Variable Importance
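# A minimal sketch of the variable-importance plot referenced by the heading
# above, using xgboost's built-in plot_importance on the model fitted in this
# block; the matplotlib import is an assumption (it is not shown in the excerpt).
import matplotlib.pyplot as plt
from xgboost import plot_importance

plot_importance(model, max_num_features=20)
plt.show()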
Example #20
import pandas as pd
from xgboost import XGBRegressor

train_path = 'src/RL/outputs/train_data.csv'
train_data = pd.read_csv(train_path)

model = XGBRegressor()

train_x = train_data.iloc[:, 0:-1]
train_y = train_data[['rewards']]

model.fit(train_x, train_y)

print(model)
Example #21
y = dataset['target']
print(y.shape)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

# 2. model
model = XGBRegressor(n_estimators=1000, learning_rate=0.01,
                     n_jobs=8)  # know how to tune these parameters

# 3. train
model.fit(x_train,
          y_train,
          verbose=1,
          eval_metric=['rmse'],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)  # use eval_metric='rmse' for regression

# evaluate
aaa = model.score(x_test, y_test)
print('score: ', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)  # when scoring, pass the original data (y_test) first
print('r2 : ', r2)

## eval set =============================
print('====================================')
result = model.evals_result()
# print('result: ', result)
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from xgboost import XGBRegressor

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X = train.iloc[:, 1:-1].values
y = train.iloc[:, -1].values
model = XGBRegressor(learning_rate=0.03, max_depth=4, n_estimators=100)
model.fit(X, y)
test_feature = test.iloc[:, 1:].values
prediction = model.predict(test_feature)
result = pd.DataFrame({'ID': test.ID, 'medv': prediction})
result.to_csv("result.csv", index=False)
# using one hot for types of Object
X = pd.get_dummies(X)

# align the one-hot testing data according to the training data
# test_data = pd.read_csv('test.csv')
# test_X = test_data.drop(['Id'], axis=1)
#
# test_X = pd.get_dummies(test_X)
# test_X = X.align(test_X,join='left',axis=1)
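# A runnable form of the commented-out alignment above. Note that
# DataFrame.align returns a pair of frames, so both results must be unpacked;
# 'test.csv' and the 'Id' column are assumptions carried over from the comments.
import pandas as pd

test_data = pd.read_csv('test.csv')
test_X = pd.get_dummies(test_data.drop(['Id'], axis=1))
X, test_X = X.align(test_X, join='left', axis=1, fill_value=0)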

# split the data into training and validation set
train_X, validation_X, train_y, validation_y = train_test_split(X.values,
                                                                y.values,
                                                                test_size=0.25)

my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
validation_X = my_imputer.transform(validation_X)

my_model = XGBRegressor(n_estimators=1000,
                        learning_rate=0.01,
                        early_stopping_rounds=5,
                        n_jobs=4)
# verbose=False avoids printing out updates with each cycle
my_model.fit(train_X, train_y, verbose=False)

# make predictions
predictions = my_model.predict(validation_X)
rms = np.sqrt(mean_squared_error(validation_y, predictions))

print(rms)
Example #24
#results = {}
#for i in [50, 100, 150, 200, 250, 300, 350, 400]:
#    results[i] = get_score(i)

# the best n_estimator is 200

#print("Mean Absolute Error for RandomForestRegressor with Cross Validation: " + str(get_score()))

import matplotlib.pyplot as plt

#plt.plot(results.keys(), results.values())
#plt.show

from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb_model.fit(X_train,
              y_train,
              early_stopping_rounds=5,
              eval_set=[(X_valid, y_valid)],
              verbose=False)

xgb_preds = xgb_model.predict(X_valid)
print("Mean Absolute Error for XGBoosting: " +
      str(mean_absolute_error(xgb_preds, y_valid)))

X.loc[:,
      'Elevation':'Horizontal_Distance_To_Fire_Points'].hist(bins=50,
                                                             figsize=(20, 15))
plt.show()
Example #25
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

#2. model
model = XGBRegressor(n_estimators=1000,
                     learning_rate=0.01,
                     n_jobs=8,
                     use_label_encoder=False)

#3. fit
model.fit(x_train,
          y_train,
          verbose=1,
          eval_metric=['rmse', 'logloss', 'mae'],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=10)
# Only this one part needs to be added; the order of the metrics in eval_metric also matters -> early stopping is applied based on the LAST metric listed (see the illustration at the end of this example)

#4. score and predict
aaa = model.score(x_test, y_test)
print("aaa : ", aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("r2 : ", r2)

print("=====================")
results = model.evals_result()
print(results)
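# As noted above, early stopping keys off the LAST metric listed in eval_metric.
# A minimal illustration (not part of the original script): with 'rmse' moved to
# the end, the same model now stops on validation RMSE instead of MAE.
model.fit(x_train,
          y_train,
          verbose=0,
          eval_metric=['logloss', 'mae', 'rmse'],  # 'rmse' last -> stopping metric
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=10)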
Example #26
File: Model.py  Project: dlont/kbc
class VanillaModelRegression(Model):
    def __init__(self, configuration):
        self._configuration = configuration
        self._objects = {}
        self._annotation = 'Performance comparison of different MVA discriminants'
        if 'annotation' in self._configuration:
            self._annotation = self._configuration['annotation']
        self.my_model = None
        self.fit_results = None
        self.Initialize()

    @log_with()
    def Initialize(self):
        self.build_best_prediction()
        pass

    @log_with()
    def get(self, name):
        """
                Factory method
                """
        if name in self._objects:
            return self._objects[name]
        else:
            return None  #provide factory method implementation here
        return self._objects[name]

    @log_with()
    def get_data_provider(self, provider_name):
        """
                Factory method for data providers
                """
        from dataprovider import PandasDataProviderFromCSV_original
        if provider_name in self._objects:
            return self._objects[provider_name]
        else:
            if '.csv' in self._configuration[provider_name]['input_file']:
                provider = PandasDataProviderFromCSV_original(
                    self._configuration[provider_name]['input_file'])
                self._objects[provider_name] = provider
            else:
                raise NotImplementedError
        return self._objects[provider_name]

    @log_with()
    def build_best_prediction(self):
        print("Dummy building vanilla model!")

        from matplotlib import pyplot
        from xgboost import XGBRegressor, plot_importance
        # from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error
        from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

        target_variable_names = self._configuration['model']['target'][0]
        data_provider = self.get_data_provider(
            self._configuration['model']['data_provider'])

        input_features_names = self._configuration['model']['input_features']
        X_train = data_provider.train[input_features_names]
        y_train = data_provider.train[target_variable_names]

        X_test = data_provider.test[input_features_names]
        y_test = data_provider.test[target_variable_names]

        # print X_train.dtypes
        # print X_train.head()
        # print X_test.dtypes
        # print X_test.head()

        # print y_train.dtypes
        # print y_train.head()
        # print y_test.dtypes
        # print y_test.head()

        eval_set = [(X_train, y_train), (X_test, y_test)]

        self.my_model = XGBRegressor(
            n_estimators=self._configuration['model']['n_estimators'],
            max_depth=self._configuration['model']['max_depth'],
            learning_rate=self._configuration['model']['learning_rate'],
            verbosity=0)
        self.my_model.fit(X_train,
                          y_train,
                          eval_metric=["rmse", "mae"],
                          eval_set=eval_set,
                          verbose=False)

        y_pred = self.my_model.predict(X_test)
        # print "Max error: ", max_error(y_test,y_pred)
        print("Explained variance score: ",
              explained_variance_score(y_test, y_pred))
        print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))
        print("Mean squared error: ", mean_squared_error(y_test, y_pred))

        self.fit_results = self.my_model.evals_result()
        # print 'YO importance'
        # plot_importance(my_model)
        pickle.dump(
            self.my_model,
            open(self._configuration['model']['output_filename'], 'wb'))

        pass
Example #27
dt = DecisionTreeRegressor(random_state=5)
dt.fit(X_train, y_train)

accuracy = dt.score(X_val, y_val)

y_pred = dt.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print('Accuracy', accuracy)
print('rmse', rmse)

# --------------
from xgboost import XGBRegressor

# Code starts here
xgb = XGBRegressor(max_depth=50, learning_rate=0.83, n_estimators=100)

xgb.fit(X_train, y_train)

accuracy = xgb.score(X_val, y_val)

y_pred = xgb.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print('Accuracy', accuracy)
print('rmse', rmse)

# Code ends here
Example #28
    '../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/03_Store198/test_store198_X_3M.pkl'
)
y_test_3M_198 = pd.read_pickle(
    '../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/03_Store198/test_store198_y_3M.pkl'
)
# Store 897
X_test_3M_897 = pd.read_pickle(
    '../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/04_Store897/test_store897_X_3M.pkl'
)
y_test_3M_897 = pd.read_pickle(
    '../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/04_Store897/test_store897_y_3M.pkl'
)

# Fit Model All Stores
xgb_model_all = XGBRegressor(n_estimators=100, learning_rate=0.1)
xgb_model_all.fit(X_train, y_train)

# Save Model All Stores
model_all_filename = "../../04_Evaluation/00_Models/xgb_model_all.pkl"
with open(model_all_filename, 'wb') as file:
    pickle.dump(xgb_model_all, file)

# Fit Model Store 708
xgb_model_708 = XGBRegressor(n_estimators=100, learning_rate=0.1)
xgb_model_708.fit(X_train_708, y_train_708)

# Save Model Store 708
model_708_filename = "../../04_Evaluation/00_Models/xgb_model_708.pkl"
with open(model_708_filename, 'wb') as file:
    pickle.dump(xgb_model_708, file)
Example #29
def main():
    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [
        f for f in training_data.columns if f.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster.
    # Remember to delete example_model.xgb if you change any of the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    if MODEL_FILE.is_file():
        print("Loading pre-trained model...")
        model.load_model(MODEL_FILE)
    else:
        print("Training model...")
        model.fit(training_data[feature_names], training_data[TARGET_NAME])
        model.save_model(MODEL_FILE)

    # Generate predictions on both training and tournament data
    print("Generating predictions...")
    try:
        training_data[PREDICTION_NAME] = model.predict(
            training_data[feature_names])
        tournament_data[PREDICTION_NAME] = model.predict(
            tournament_data[feature_names])
    except Exception as e:
        print(e)
        print(
            "If you received the error 'Floating point is not supported', this is likely due to using version >=1.4 of XGBoost"
        )
        print(
            "Downgrade to XGBoost 1.3.3 by typing the following into your command line"
        )
        print("pip install xgboost==1.3.3")
        print("\nAlternatively, change the lines that start with")
        print("training_data =...")
        print("tournament_data =...")
        print("\nTo the following")
        print(
            "training_data = pd.read_parquet(\"s3://numerai-public-datasets/latest_numerai_training_data.parquet\")"
        )
        print(
            "tournament_data = pd.read_parquet(\"s3://numerai-public-datasets/latest_numerai_tournament_data.parquet\")"
        )
        print("\nThis will require more RAM")

    # Check the per-era correlations on the training set (in sample)
    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std(ddof=0)}"
    )
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}"
    )
    """Validation Metrics"""
    # Check the per-era correlations on the validation set (out of sample)
    validation_data = tournament_data[tournament_data.data_type ==
                                      "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std(ddof=0)}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}"
    )

    # Check the "sharpe" ratio on the validation set
    validation_sharpe = validation_correlations.mean(
    ) / validation_correlations.std(ddof=0)
    print(f"Validation Sharpe: {validation_sharpe}")

    print("checking max drawdown...")
    rolling_max = (validation_correlations + 1).cumprod().rolling(
        window=100, min_periods=1).max()
    daily_value = (validation_correlations + 1).cumprod()
    max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
    print(f"max drawdown: {max_drawdown}")

    # Check the feature exposure of your validation predictions
    feature_exposures = validation_data[feature_names].apply(
        lambda d: correlation(validation_data[PREDICTION_NAME], d), axis=0)
    max_per_era = validation_data.groupby("era").apply(
        lambda d: d[feature_names].corrwith(d[PREDICTION_NAME]).abs().max())
    max_feature_exposure = max_per_era.mean()
    print(f"Max Feature Exposure: {max_feature_exposure}")

    # Check feature neutral mean
    print("Calculating feature neutral mean...")
    feature_neutral_mean = get_feature_neutral_mean(validation_data)
    print(f"Feature Neutral Mean is {feature_neutral_mean}")

    # Load example preds to get MMC metrics
    example_preds = pd.read_csv("example_predictions.csv").set_index(
        "id")["prediction"]
    validation_example_preds = example_preds.loc[validation_data.index]
    validation_data["ExamplePreds"] = validation_example_preds

    print("calculating MMC stats...")
    # MMC over validation
    mmc_scores = []
    corr_scores = []
    for _, x in validation_data.groupby("era"):
        series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])),
                                   pd.Series(unif(x["ExamplePreds"])))
        mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29**2))
        corr_scores.append(
            correlation(unif(x[PREDICTION_NAME]), x[TARGET_NAME]))

    val_mmc_mean = np.mean(mmc_scores)
    val_mmc_std = np.std(mmc_scores)
    val_mmc_sharpe = val_mmc_mean / val_mmc_std
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
    corr_plus_mmc_mean = np.mean(corr_plus_mmcs)
    corr_plus_mmc_sharpe_diff = corr_plus_mmc_sharpe - validation_sharpe

    print(f"MMC Mean: {val_mmc_mean}\n"
          f"Corr Plus MMC Sharpe:{corr_plus_mmc_sharpe}\n"
          f"Corr Plus MMC Diff:{corr_plus_mmc_sharpe_diff}")

    # Check correlation with example predictions
    full_df = pd.concat([
        validation_example_preds, validation_data[PREDICTION_NAME],
        validation_data["era"]
    ],
                        axis=1)
    full_df.columns = ["example_preds", "prediction", "era"]
    per_era_corrs = full_df.groupby('era').apply(
        lambda d: correlation(unif(d["prediction"]), unif(d["example_preds"])))
    corr_with_example_preds = per_era_corrs.mean()
    print(f"Corr with example preds: {corr_with_example_preds}")

    # Save predictions as a CSV and upload to https://numer.ai
    tournament_data[PREDICTION_NAME].to_csv("submission.csv", header=True)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

svr = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1000))
svr.fit(X, y)

-1 * cross_val_score(svr, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
# -

# # Extreme Gradient Boosting

# +
from xgboost import XGBRegressor

xgb = XGBRegressor(learning_rate=0.1)
xgb.fit(X, y)

-1 * cross_val_score(xgb, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
# -

# # Light Gradient Boosting

# +
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=1000)
lgbm.fit(X, y)

-1 * cross_val_score(lgbm, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
# -
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-0.13400621878282912
exported_pipeline = XGBRegressor(booster="dart",
                                 learning_rate=0.1,
                                 max_depth=4,
                                 n_estimators=300,
                                 n_jobs=-1,
                                 objective="reg:linear")

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #32
X = data.drop(["id", "血糖", "体检日期"], axis=1)
Y = data["血糖"]
Y = np.log1p(Y)

for column in X.columns:
    X[column] = np.log1p(X[column])

for column in test.columns:
    test[column] = np.log1p(test[column])

clf = XGBRegressor()
print("---111----")
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
test_score = np.sqrt(
    -cross_val_score(clf, X, Y, cv=kfold, scoring='neg_mean_squared_error'))
print("------test_score--------")
print(test_score)
print(np.mean(test_score))
print("---2----")
clf.fit(X, Y)
print("---3----")
pred = np.expm1(clf.predict(test))
pred_df = pd.DataFrame()
pred_df["pred"] = pred

pred_df.to_csv(
    '/Users/jianjun.yue/PycharmGItHub/data/人工智能辅助糖尿病遗传风险预测/sub_0107_XG_log1p.csv',
    header=False,
    index=False,
    float_format='%.3f')
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

X_train, X_test, y_train, y_test = train_test_split(X.values,
                                                    y.values,
                                                    test_size=0.25)

my_imputer = Imputer()

X_train_impute = my_imputer.fit_transform(X_train)
X_test_impute = my_imputer.transform(X_test)

#1st model
my_model = XGBRegressor()

my_model.fit(X_train_impute, y_train, verbose=False)

predictions_1 = my_model.predict(X_test_impute)

print("Normal Error:" + str(mean_absolute_error(predictions_1, y_test)))

#2nd model

my_model = XGBRegressor(n_estimators=10000, learning_rate=0.05)

my_model.fit(X_train_impute,
             y_train,
             early_stopping_rounds=5,
             eval_set=[(X_test_impute, y_test)],
             verbose=False)
def float_col_objective(
    trial,
    X,
    y,
    random_state=22,
    n_splits=3,
    n_repeats=1,
    n_jobs=-1,  # SOURCE https://discuss.xgboost.ai/t/n-jobs-1-no-longer-uses-all-cores/1955/6
    early_stopping_rounds=50):
    # ANCHOR XGBoost parameters
    params = {
        "objective":
        "reg:squarederror",  # NOTE Learning task parameters - https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005,
                                                  0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2,
                                                     0.8),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform(
            "gamma", 1e-8, 10.0
        ),  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
        "min_child_weight": trial.suggest_loguniform(
            "min_child_weight", 10, 1000
        ),  # A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.
        # "scale_pos_weight": 1, # because of high class imbalance
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "seed": random_state,
        "tree_method": 'gpu_hist'  # NOTE 
    }

    xgb_model = XGBRegressor(**params)
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-rmse"
    )  # NOTE observation_keys - https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.XGBoostPruningCallback.html
    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-auc")
    # CONSTRUCTION
    # NOTE oof - https://www.kaggle.com/vinhnguyen/accelerating-xgboost-with-gpu
    rkf = RepeatedKFold(n_splits=n_splits,
                        n_repeats=n_repeats,
                        random_state=random_state)
    X_values = X.values
    y_values = y.values
    y_oof = np.zeros_like(y_values)
    res = 0
    for fold, (train_index,
               valid_index) in enumerate(rkf.split(X_values, y_values)):
        X_A, X_B = X_values[train_index, :], X_values[valid_index, :]
        y_A, y_B = y_values[train_index], y_values[valid_index]
        xgb_model.fit(
            X_A,
            y_A,
            eval_set=[(X_B, y_B)],
            eval_metric="rmse",  # NOTE Learning task parameters - https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters ; used here to check that the pruning callback fires, not to pick the optuna hyperparameters directly
            early_stopping_rounds=early_stopping_rounds,
            callbacks=[pruning_callback],
            verbose=0)
        y_pred = xgb_model.predict(X_B)
        y_oof[valid_index] += y_pred
        res += np.sqrt(mean_squared_error(y_pred, y_B)) / (n_splits * n_repeats
                                                           )  # I added it.

    y_oof /= n_repeats  # Original
    trial.set_user_attr(
        key="best_booster",
        value=xgb_model)  # NOTE update the best model in the optuna's table.
    # return np.sqrt(mean_squared_error(y_values, y_pred)) # Originally, the author uses y_train. I think it's incorrect.
    return res  # I changed the last line to this one.
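# A minimal sketch of running the objective above with Optuna (not part of the
# original excerpt); the study settings, n_trials, and the (X, y) frames are
# illustrative assumptions.
import optuna

study = optuna.create_study(direction="minimize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(lambda trial: float_col_objective(trial, X, y), n_trials=50)
print(study.best_params)
print(study.best_value)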
#           alpha = 1,
#           gamma = 2,
#           min_child_weight = 1,
#           base_score = 7.76
#           nrounds=5000,
#           nfold=5,
#           early_stopping_rounds=15,
#           print_every_n = 10,
#           verbose= 1,
#           feval=xg_eval_mae,
#           maximize=FALSE
#           )

    folds = KFold(n_splits=3, shuffle=False)
    for k, (train_index, test_index) in enumerate(folds.split(train_xg_x)):
        xtr = train_xg_x[train_index]
        ytr = train_xg_y[train_index]
        xtest = train_xg_x[test_index]
        ytest = train_xg_y[test_index]
        print("Fitting on fold {}...".format(k))
        print("Checking xtest shape: ", xtest.shape)
        print("Checking ytest shape: ", ytest.shape)
        xgboosting.fit(xtr, ytr, verbose=True)
        np.savetxt('xgb_pred_fold_{}.txt'.format(k), np.exp(xgboosting.predict(xtest)))
        np.savetxt('xgb_test_fold_{}.txt'.format(k), ytest)

    # Training xgboost on test set (i.e. whole train set).
    xgboosting.fit(train_xg_x, train_xg_y, verbose=True)
    print("Fitting on test set...")
    np.savetxt('xgb_pred_test.txt', np.exp(xgboosting.predict(test_xg_x)))
Example #36
def fitmodel(train, test, verbose=0, train_model=False, plot_graph=False):
    trainY = train['price']
    testY = test['price']
    trainX = train.drop(['price'], axis=1)
    testX = test.drop(['price'], axis=1)

    if train_model == True:
        print("Training model...")
        params = {
            "max_depth": st.randint(3, 40),
            "colsample_bytree": st.beta(10, 1),
            "subsample": st.beta(10, 1),
            "gamma": st.uniform(0, 10),
            "min_child_weight": st.expon(0, 50),
        }
        gboost = XGBRegressor(n_estimators=5, learning_rate=.2)
        tmp = RandomizedSearchCV(gboost,
                                 params,
                                 cv=10,
                                 n_jobs=-1,
                                 verbose=verbose,
                                 n_iter=25)
        tmp.fit(trainX, trainY)
        print("Optimised parameters: ")
        print(tmp.best_params_)
        gboost_opt = tmp.best_estimator_
        gboost_opt.set_params(n_estimators=100, learning_rate=.1, n_jobs=-1)
    else:
        gboost_opt = XGBRegressor(base_score=0.5,
                                  booster='gbtree',
                                  colsample_bylevel=1,
                                  colsample_bytree=0.87219466652443045,
                                  gamma=7.0610396795642156,
                                  learning_rate=0.1,
                                  max_delta_step=0,
                                  max_depth=23,
                                  min_child_weight=13.539302225736687,
                                  missing=None,
                                  n_estimators=100,
                                  n_jobs=-1,
                                  nthread=None,
                                  objective='reg:linear',
                                  random_state=0,
                                  reg_alpha=0,
                                  reg_lambda=1,
                                  scale_pos_weight=1,
                                  seed=None,
                                  silent=True,
                                  subsample=0.95498622807161138)

    print("Final model: ")
    print(gboost_opt)

    gboost_opt.fit(trainX, trainY)
    trainY_pred = gboost_opt.predict(trainX)
    testY_pred = gboost_opt.predict(testX)
    print("Performance metrics: \n")
    print("RMSE : %.4f (train) %.4f (test)" % (mean_squared_error(
        trainY, trainY_pred)**.5, mean_squared_error(testY, testY_pred)**.5))
    print("MAE : %.4f (train) %.4f (test)" % (mean_absolute_error(
        trainY, trainY_pred), mean_absolute_error(testY, testY_pred)))
    print("MedianAE : %.4f (train) %.4f (test)" % (median_absolute_error(
        trainY, trainY_pred), median_absolute_error(testY, testY_pred)))
    train_err = np.absolute(trainY - trainY_pred) / trainY
    test_err = np.absolute(testY - testY_pred) / testY
    print("Mean Absolute Percentage Error: %.4f (train) %.4f (test)" %
          (np.mean(train_err), np.mean(test_err)))
    print("Median Absolute Percentage Error: %.4f (train) %.4f (test)" %
          (np.median(train_err), np.median(test_err)))

    th = [.01, .05, .1, .2, .3]
    train_err_vec = np.zeros(len(th))
    test_err_vec = np.zeros(len(th))
    for i in range(len(th)):
        train_err_vec[i] = np.sum((train_err < th[i] * 1)) / len(train_err)
        test_err_vec[i] = np.sum((test_err < th[i] * 1)) / len(test_err)
        print(
            "Absolute Percentage Error within %.2f: %.2f (train), %.2f (test)"
            % (th[i], train_err_vec[i], test_err_vec[i]))

    if plot_graph == True:
        feat_imp = pd.Series(gboost_opt.feature_importances_,
                             list(trainX)).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
        plt.show()
Example #37
# -*- coding: utf-8 -*-

from xgboost import XGBRegressor
import pandas as pd

train = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\train.csv")
test = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\test.csv")

train.drop('ID', axis=1, inplace=True)

y_train = train.pop('target')
pred_index = test.pop('ID')

reg = XGBRegressor()
reg.fit(train, y_train)
y_pred = reg.predict(test)

submit = pd.DataFrame()
submit['ID'] = pred_index
submit['target'] = y_pred
submit.to_csv('my_XGB_prediction.csv', index=False)
Example #38
#
#split train and test
print("split")
train = df[df.index.get_level_values(1) <= '2016-10-23']
test = df[df.index.get_level_values(1) >= '2016-11-01']

train_X, train_y = train[features], train[target]
test_X, test_y = test[features], test[target]

print("model")

from xgboost import XGBRegressor
from smape import XGBsmape

xgbr = XGBRegressor(max_depth=9,
                    learning_rate=0.05,
                    n_estimators=1000,
                    silent=True,
                    objective='reg:linear',
                    nthread=-1,
                    subsample=0.8,
                    colsample_bytree=0.8)

xgbr.fit(train_X,
         train_y,
         eval_set=[(train_X, train_y), (test_X, test_y)],
         eval_metric=XGBsmape,
         early_stopping_rounds=10,
         verbose=True)
Example #39
df_valid = df_valid.set_index('key_0')

typ = df_valid.dtypes
df_valid.to_csv('df_valid_cat.csv', header=None, index=False)

df_train.columns
RFR = RandomForestRegressor()
RFR.fit(X_train, Y_train)

RFR_preds = pd.DataFrame(RFR.predict(X_test),columns=['salePrice'],index=Y_test.index)
print(mean_absolute_error(Y_test, RFR_preds))

RFR_new = RFR_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

XGB = XGBRegressor()
XGB.fit(X_train, Y_train, verbose=False)
XGB_preds = pd.DataFrame(XGB.predict(X_test),columns=['salePrice'],index=Y_test.index).astype(int)
print(mean_absolute_error(Y_test,XGB_preds))

XGB_new = XGB_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

GBR = GradientBoostingRegressor()
GBR.fit(X_train, Y_train)
GBR_preds = pd.DataFrame(GBR.predict(X_test),columns=['salePrice'],index=Y_test.index)
print(mean_absolute_error(Y_test,GBR_preds))

GBR_new = GBR_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

sns.swarmplot(x=GBR_preds['salePrice'],y=Y_test)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
Example #40
y_predict = model.predict(x_test)
print("final R2 score : ", r2_score(y_test, y_predict))

model = model.best_estimator_

thresholds = np.sort(model.feature_importances_)
print(thresholds)

n = 0
r2 = 0
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)
    selection_model = XGBRegressor(n_jobs=-1)
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)

    score = r2_score(y_test, y_predict)
    if score * 100.0 > r2:
        n = select_x_train.shape[1]
        r2 = score * 100.0
        L_selection = selection
        print("Thresh=%.3f, n=%d, R2: %.2f%%" %
              (thresh, select_x_train.shape[1], score * 100.0))

x_train = L_selection.transform(x_train)
x_test = L_selection.transform(x_test)
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
def ValidateTrainTestErrorsWithDifferentModels(cvX_train, cvX_test, cvy_train, cvy_test,X_train,y_train,X_test):
    clfs = list()
    cvClfs = list()

    print("Building RF1")
    rfShortCV = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShort = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShortCV.fit(cvX_train, cvy_train);
    print('RF1 CV Results :', mean_absolute_error(cvy_test, rfShortCV.predict(cvX_test)))
    pd.DataFrame({"Actual":cvy_test, "Predicted":rfShortCV.predict(cvX_test)}).to_csv("snehaRF.csv", index=False,header=True);
    rfShort.fit(X_train,y_train)
    cvClfs.append(rfShortCV)
    clfs.append(rfShort)
    pd.DataFrame({"ID":out_id, "Expected":rfShort.predict(X_test)}).to_csv("subRF1.csv", index=False,header=True);

    print("Building SVM")
    clfSVRCV = SVR(C=10.0)
    clfSVR = SVR(C=10.0)
    clfSVRCV.fit(cvX_train, cvy_train);
    print('SVM CV Results :', mean_absolute_error(cvy_test, clfSVRCV.predict(cvX_test)))
    pd.DataFrame({"Actual":cvy_test, "Predicted":clfSVRCV.predict(cvX_test)}).to_csv("snehaSVR.csv", index=False,header=True);

    print "Building RF2"
    rfLongCV = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLong = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLongCV.fit(cvX_train, cvy_train);
    print 'RF2 CV Results :',mean_absolute_error(cvy_test,rfLongCV.predict(cvX_test))
    rfLong.fit(X_train,y_train)
    cvClfs.append(rfLongCV)
    clfs.append(rfLong)
    pd.DataFrame({"ID":out_id, "Expected":rfLong.predict(X_test)}).to_csv("subRF2.csv", index=False,header=True);


    print "Building GB1"
    regGBCV1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGBCV1.fit(cvX_train, cvy_train);
    print 'GB1 CV Results :',mean_absolute_error(cvy_test,regGBCV1.predict(cvX_test))
    regGB1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGB1.fit(X_train,y_train)
    cvClfs.append(regGBCV1)
    clfs.append(regGB1)
    pd.DataFrame({"ID":out_id, "Expected":regGB1.predict(X_test)}).to_csv("subGB1.csv", index=False,header=True);


    print 'Building GB2'
    regGBCV2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGBCV2.fit(cvX_train, cvy_train);
    print 'GB2 CV Results :',mean_absolute_error(cvy_test,regGBCV2.predict(cvX_test))
    regGB2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGB2.fit(X_train,y_train)
    cvClfs.append(regGBCV2)
    clfs.append(regGB2)
    pd.DataFrame({"ID":out_id, "Expected":regGB2.predict(X_test)}).to_csv("subGB2.csv", index=False,header=True);


    print 'Feature Importances RF1:',sorted(zip(map(lambda x: round(x, 4), rfShort.feature_importances_), df_final.columns),reverse=True);
    print 'Feature Importances GB1:',sorted(zip(map(lambda x: round(x, 4), regGB1.feature_importances_), df_final.columns),reverse=True);
    print 'Feature Importances RF2:',sorted(zip(map(lambda x: round(x, 4), rfLong.feature_importances_), df_final.columns),reverse=True);
    print 'Feature Importances GB2:',sorted(zip(map(lambda x: round(x, 4), regGB2.feature_importances_), df_final.columns),reverse=True);

    print "Building XGB1"
    xgbCV1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None,
                        learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7)
    xgbCV1.fit(cvX_train, cvy_train);
    xgb1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None,
                        learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7)
    xgb1.fit(X_train,y_train);
    print 'XGB1 Model CV :',mean_absolute_error(cvy_test,xgbCV1.predict(cvX_test));
    cvClfs.append(xgbCV1)
    clfs.append(xgb1)
    pd.DataFrame({"ID":out_id, "Expected":xgb1.predict(X_test)}).to_csv("subXGB1.csv", index=False,header=True);



    print "Building XGB2"
    params = {}
    params["objective"] = "reg:linear"
    params["learning_rate"] = 0.005
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.75
    params["silent"] = 1
    params["max_depth"] = 7
    params["n_estimators"] = 3000
    params['gamma'] = 1.25
    params['nthread'] = -1
    print 'XGBoost Training Process Started'
    xgbCV2 = XGBRegressor(**params);
    xgbCV2.fit(cvX_train, cvy_train);
    print 'XGB Model CV :',mean_absolute_error(cvy_test,xgbCV2.predict(cvX_test));
    xgb2 = XGBRegressor(**params);
    xgb2.fit(X_train,y_train);
    cvClfs.append(xgbCV2)
    clfs.append(xgb2)
    pd.DataFrame({"ID":out_id, "Expected":xgb2.predict(X_test)}).to_csv("subXGB2.csv", index=False,header=True);


    # Return the cross validated models and the actual fitted models separately.
    return [clfs,cvClfs];
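
# Hedged usage sketch: unpack the two lists returned above and blend the fully
# fitted models with a simple equal-weight average (the blend weights and the
# subBlend.csv filename are illustrative choices, not part of the original code):
clfs, cvClfs = ValidateTrainTestErrorsWithDifferentModels(
    cvX_train, cvX_test, cvy_train, cvy_test, X_train, y_train, X_test)
blend = sum(clf.predict(X_test) for clf in clfs) / float(len(clfs))
pd.DataFrame({"ID": out_id, "Expected": blend}).to_csv("subBlend.csv", index=False, header=True)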
Example #43
0
start = time.time()
if (gridsearch & sample): # only do gridsearch if we run with sampled data.
    print "Attempting GridSearchCV for XGB model"
    gscv = GridSearchCV(regressor, {
        'max_depth': [3, 5, 7, 11, 13, 17, 23],
        'n_estimators': [32, 64, 128, 512, 1024, 2048, 4096],
        'learning_rate': [0.15],
        'subsample': [0.6,0.7,0.8],
        'colsample_bytree': [0.6,0.7,0.8]},
        verbose=1, n_jobs=2)
    regressor = gscv.fit(np.array(train[features]), train[goal])
    print(regressor.best_score_)
    print(regressor.best_params_)
else:
    regressor.fit(np.array(train[features]), train[goal])
print '  -> Training time:', time.time() - start
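
# rmspe() used in the evaluation below is defined elsewhere in this script; a
# sketch of one common definition (root mean squared percentage error over
# non-zero targets), matching the (predictions, actuals) argument order used here:
def rmspe_sketch(y_pred, y_true):
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    mask = y_true != 0  # zero targets are conventionally ignored in RMSPE
    return np.sqrt(np.mean(((y_true[mask] - y_pred[mask]) / y_true[mask]) ** 2))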

# Evaluation and export result
if sample:
    if not gridsearch:
        # Test results
        if logexp:
            print "RMSPE: " + str(rmspe(map(lambda x : np.exp(x)-1, regressor.predict(np.array(test[features]))),test[goal].values))
        else:
            print "RMSPE: " + str(rmspe(regressor.predict(np.array(test[features])),test[goal].values))
else:
    csvfile = 'result/' + regressor.__class__.__name__ + '-submit.csv'
    with open(csvfile, 'w') as output:
        predictions = []
        for i in test[myid].tolist():
class PrudentialRegressorFO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
                       objective=self.objective,
                       learning_rate=self.learning_rate,
                       min_child_weight=self.min_child_weight,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       max_depth=self.max_depth,
                       n_estimators=self.n_estimators,
                       nthread=self.nthread,
                       missing=0.0,
                       seed=self.seed)
        from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
#                       basinhopping=True,
                       initial_params=self.initial_params,
                       minimizer=self.minimizer,
                       scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
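
# fit()/predict() above read booster().best_iteration, which is only populated
# when the model is trained with early stopping. A hedged sketch of how that is
# usually set up with the older xgboost sklearn API used in this code; X and y
# stand for the same training data passed to fit(), the hold-out split is an
# assumption for illustration, and newer releases spell the call get_booster():
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
reg = XGBRegressor(objective='reg:linear', learning_rate=0.045, n_estimators=700,
                   max_depth=7, subsample=0.8, colsample_bytree=0.7)
reg.fit(X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        early_stopping_rounds=50,
        verbose=False)
best_iter = reg.booster().best_iteration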
Example #45
0
kfold = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="mean_absolute_error", n_jobs=-1, cv=kfold)


# In[210]:

result = grid_search.fit(train,y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))


# In[211]:

model=XGBRegressor(learning_rate=0.3,n_estimators=100)
for traincv, testcv in kfold:
    model.fit(train.iloc[traincv], y.iloc[traincv])  # fit on the training fold indices



# In[212]:

y_pred=model.predict(test)


# In[213]:

output2 = pd.DataFrame( data={"outlet_no":outlet,"total_sales_Actual": y_pred} )
output2.to_csv("model.csv", index=False,quoting=3)


# In[ ]:
# In[41]:

#Build the model
#model=ExtraTreesRegressor()
#model=RandomForestRegressor()
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#                  'learning_rate': 0.01, 'loss': 'ls'}
params = {'n_estimators': 400, 'max_depth': 7}
#model=GradientBoostingRegressor(**params)
model=XGBRegressor(**params)
#model=GaussianNB()
#model=Ridge()
#model=KNeighborsRegressor()
#model=DecisionTreeRegressor()
model.fit(train_dataset,train_target)

#Predict with the model
predictions=model.predict(test_dataset)


# In[51]:

### Cross Validation ###

#cv = StratifiedKFold(train_dataset, n_folds=5)

###scoring
scores = cross_validation.cross_val_score(model, train_dataset, train_target, cv=5)
print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
X_train = train_df.loc[:, train_df.columns != 'price_doc'].values
X_test = test_df.values

################################## XGBRegressor ###############################

#Initialize Model
xgb = XGBRegressor()

#Create cross-validation
cv = TimeSeriesSplit(n_splits=5)
#Train & Test Model
cross_val_results = cross_val_score(xgb, X_train, Y_train, cv=cv, scoring='neg_mean_squared_error')
print(cross_val_results.mean())
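
# scoring='neg_mean_squared_error' yields negated MSE per fold; a small optional
# sketch to report RMSE instead:
import numpy as np
rmse_per_fold = np.sqrt(-cross_val_results)
print('RMSE per fold:', rmse_per_fold, 'mean:', rmse_per_fold.mean())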


model = xgb.fit(X_train, Y_train)
# model.feature_importances_;

from xgboost import XGBRegressor

#Get Data
Y_train = train_df['price_doc'].values
X_train = train_df.loc[:, train_df.columns != 'price_doc'].values
X_test = test_df.values
#Init Model
xgb = XGBRegressor(learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.7)
#Train Model
model = xgb.fit(X_train, Y_train)
#Make Predictions
predictions = xgb.predict(X_test)
Example #48
0
def Model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    def evaluate(model, test_features, test_labels,train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f}'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))
        print("MSE for train data is: %f" % mean_squared_error(train_labels, model.predict(train_features)))
        print("MSE for validation data is: %f" % mean_squared_error(test_labels, model.predict(test_features)))
        return accuracy
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    
    
    """
        . Ridge model
    """
    
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    
    
    """
        . Random Forest
    """
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train=train_linear_fea
    Y_train=train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0)
    
    
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    #
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    
    #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search,
    # we can explicitly specify every combination of settings to try. 
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110,120,130],
        'max_features': [2, 3],
        'min_samples_leaf': [1,2,3, 4],
        'min_samples_split': [2,4,6,8, 10, 12],
        'n_estimators': [600,700, 800, 900, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    
    best_random = grid_search.best_estimator_
    start=time.time()
    best_random.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_rf_predict=best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_rf = importance_rf.iloc[:20,]
    
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_rf=np.expm1(best_random.predict(test_linear))
    
    """
        . Xgboost
    """
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
        # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    
    return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
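
# Hedged usage sketch: Model() returns four test-set predictions and four in-sample
# predictions; an equal-weight blend of the test-set predictions is shown purely as
# an illustration (the weights are not part of the original code):
(test_pred_lasso, test_pred_ridge, test_pred_rf, test_pred_xgb,
 y_lasso_pred, y_ridge_pred, y_rf_pred, y_xgb_pred) = Model(train_linear, test_linear)
blended_test_prediction = 0.25 * (test_pred_lasso + test_pred_ridge +
                                  test_pred_rf + test_pred_xgb)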
Example #49
-1
def XgBoost(train_linear, test_linear):
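    # Note: this function reuses objects created in Model() above -- X_train/Y_train,
    # the x_train_rf/x_test_rf/y_train_rf/y_test_rf split, train_linear_fea,
    # real_train_tar and the evaluate() helper -- so it is not standalone as written.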
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
        # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    from xgboost import XGBRegressor
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    importance_xgb=importance_xgb[importance_xgb['features']!='Id']
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    write_pkl(xgb_random.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/xgb_params.pkl')
    return test_prediction_xgb