def XGB(self, x_train, y_train, x_test, y_test):
     x_train, y_train = shuffle(x_train, y_train)
     xgb = XGBRegressor(max_depth=4, subsample=0.9)
     xgb.fit(x_train,y_train)
     y_pred = xgb.predict(x_test).reshape(x_test.shape[0], 1)
     loss = mean_squared_error(y_pred, y_test)
     print(loss)
     return y_pred, loss
Example no. 2
def Stacking(real_train_tar):
    predictions_train = pd.DataFrame([np.expm1(y_lasso_predict), np.expm1(y_ridge_predict), np.expm1(y_rf_predict), np.expm1(y_xgb_predict)]).T
    sns.pairplot(predictions_train)
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
        # Minimum sum of instance weights required in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    stack_model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_stack.fit(predictions_train, real_train_tar)
    xgb_stack.best_params_
    write_pkl(xgb_stack.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl')
    
    model_stacking = XGBRegressor(**xgb_stack.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_stacking.fit(predictions_train,real_train_tar)
    end=time.time()
    print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar),np.log1p( model_stacking.predict(predictions_train))))
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    y_stack_predict=model_stacking.predict(predictions_train)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,y_stack_predict)
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
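# Note: Stacking() above fits the meta-model only on the base models' train-set
# predictions and never scores held-out data. A minimal sketch of doing so, assuming
# hypothetical arrays y_lasso_test, y_ridge_test, y_rf_test and y_xgb_test (not
# defined in the original code) holding each base model's log-scale test predictions:
#
#     stack_test = pd.DataFrame([np.expm1(y_lasso_test), np.expm1(y_ridge_test),
#                                np.expm1(y_rf_test), np.expm1(y_xgb_test)]).T
#     test_prediction_stack = model_stacking.predict(stack_test)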
Example no. 3
class HousePricePredictor(BaseModel):
    def __init__(self):
        self.model = XGBRegressor()

    def predict(self, X):
        X = self._prepare_data(X)
        return self.model.predict(X)

    def _prepare_data(self, X):
        return pd.DataFrame(X, columns=FEATURES)

    def fit(self, X, y):
        model = XGBRegressor()
        clf = GridSearchCV(
            model,
            {
                'max_depth': [6, ],
                'learning_rate': [0.05, ],
                'n_estimators': [450, 470, 475, 480, 485, ]
            },
            n_jobs=4,
            cv=3,
            verbose=1
        )
        clf.fit(X, y)
        logging.info("Best Score: {}".format(clf.best_score_))
        logging.info("Best Params: {}".format(clf.best_params_))
        self.model = clf.best_estimator_

        return self.model

    def dump(self, path):
        self.model.save_model(path)

    @classmethod
    def load(cls, path):
        house_model = cls()
        house_model.model.load_model(path)

        return house_model
Example no. 4
test = test.drop(["单核细胞%"], axis=1)
# X = X.drop(["淋巴细胞%"], axis=1)
# X = X.drop(["乙肝e抗原"], axis=1)
# X = X.drop(["乙肝表面抗体"], axis=1)
Y = data["血糖"]
Y = np.log1p(Y)

clf = XGBRegressor()
print("---111----")
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
test_score = np.sqrt(
    -cross_val_score(clf, X, Y, cv=kfold, scoring='neg_mean_squared_error'))
print("------test_score--------")
print(test_score)
print(np.mean(test_score))
print("---2----")
clf.fit(X, Y)
FeatureImportances = pd.Series(
    clf.get_booster().get_fscore()).sort_values(ascending=False)
print(FeatureImportances)
print("---3----")
pred = np.expm1(clf.predict(test))
pred_df = pd.DataFrame()
pred_df["pred"] = pred

pred_df.to_csv(
    '/Users/jianjun.yue/PycharmGItHub/data/人工智能辅助糖尿病遗传风险预测/sub_0107_XG_去掉负效果特征.csv',
    header=False,
    index=False,
    float_format='%.3f')
Example no. 5
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                                          model2.predict(train[col])))
#test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2
test['visitors'] = model2.predict(test[col])
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
#del train; del data;

sub1[['id', 'visitors']].to_csv(os.path.join(path_kaggle, 'naive_forecast2.csv'),
                                index = False)

from xgboost import XGBRegressor
model3 = XGBRegressor()
model3.fit(train[col], np.log1p(train['visitors'].values), verbose=False)
print('XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values),
                              model3.predict(train[col])))

## from hklee
## https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code
#dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
#    pd.read_csv(fn)for fn in glob.glob('../input/*.csv')}
#
#for k, v in dfs.items(): locals()[k] = v
#
#wkend_holidays = date_info.apply(
#    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
#date_info.loc[wkend_holidays, 'holiday_flg'] = 0
#date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  
#
#visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
#visit_data.drop('calendar_date', axis=1, inplace=True)
Example no. 6
	mea = getmea(max_leaf_nodes,train_x,val_x,train_y,val_y)
	print("Max_leaf_nodes: %d ,mea: %d" %(max_leaf_nodes,mea))

'''
# clf = XGBRegressor() 17165
# XGBRegressor(n_estimators=400)  16330
'''
params = [.02,.03,.04,.05,.06,.07,.08,.09,.10]#[1:1001:50][100,200,300,400,500]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=400,learning_rate=param)
    test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("n_estimator vs CV Error" + str(params));
# This line is required for the plot to be displayed on screen
plt.show()
'''

my_model = XGBRegressor(n_estimators=400)
my_model.fit(train_X, train_y,verbose=False)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

#save model
#joblib.dump(melbourne_model,'model.pickle')

#load model
#model = joblib.load('model.pickle')

def ValidateTrainTestErrorsWithDifferentModels(cvX_train, cvX_test, cvy_train, cvy_test,X_train,y_train,X_test):
    clfs = list()
    cvClfs = list()

    print "Building RF1"
    rfShortCV = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShort = ensemble.RandomForestRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShortCV.fit(cvX_train, cvy_train);
    print('RF1 CV Results :', mean_absolute_error(cvy_test, rfShortCV.predict(cvX_test)))
    pd.DataFrame({"Actual":cvy_test, "Predicted":rfShortCV.predict(cvX_test)}).to_csv("snehaRF.csv", index=False,header=True);
    rfShort.fit(X_train,y_train)
    cvClfs.append(rfShortCV)
    clfs.append(rfShort)
    pd.DataFrame({"ID":out_id, "Expected":rfShort.predict(X_test)}).to_csv("subRF1.csv", index=False,header=True);

    print "Building SVM"
    clfSVRCV = SVR(C=10.0)
    clfSVR = SVR(C=10.0)
    clfSVRCV.fit(cvX_train, cvy_train);
    print('SVM CV Results :', mean_absolute_error(cvy_test, clfSVRCV.predict(cvX_test)))
    pd.DataFrame({"Actual":cvy_test, "Predicted":clfSVRCV.predict(cvX_test)}).to_csv("snehaSVR.csv", index=False,header=True);

    print "Building RF2"
    rfLongCV = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLong = ensemble.RandomForestRegressor(min_samples_split=200,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLongCV.fit(cvX_train, cvy_train);
    print('RF2 CV Results :', mean_absolute_error(cvy_test, rfLongCV.predict(cvX_test)))
    rfLong.fit(X_train,y_train)
    cvClfs.append(rfLongCV)
    clfs.append(rfLong)
    pd.DataFrame({"ID":out_id, "Expected":rfLong.predict(X_test)}).to_csv("subRF2.csv", index=False,header=True);


    print "Building GB1"
    regGBCV1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGBCV1.fit(cvX_train, cvy_train);
    print('GB1 CV Results :', mean_absolute_error(cvy_test, regGBCV1.predict(cvX_test)))
    regGB1 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=None, min_samples_leaf=50, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGB1.fit(X_train,y_train)
    cvClfs.append(regGBCV1)
    clfs.append(regGB1)
    pd.DataFrame({"ID":out_id, "Expected":regGB1.predict(X_test)}).to_csv("subGB1.csv", index=False,header=True);


    print('Building GB2')
    regGBCV2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGBCV2.fit(cvX_train, cvy_train);
    print('GB2 CV Results :', mean_absolute_error(cvy_test, regGBCV2.predict(cvX_test)))
    regGB2 = ensemble.GradientBoostingRegressor(min_samples_split=50,n_estimators=1000, max_depth=7, min_samples_leaf=200, max_features="auto", subsample=0.6, learning_rate=0.01, random_state=0,loss='lad')
    regGB2.fit(X_train,y_train)
    cvClfs.append(regGBCV2)
    clfs.append(regGB2)
    pd.DataFrame({"ID":out_id, "Expected":regGB2.predict(X_test)}).to_csv("subGB2.csv", index=False,header=True);


    print('Feature Importances RF1:', sorted(zip(map(lambda x: round(x, 4), rfShort.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB1:', sorted(zip(map(lambda x: round(x, 4), regGB1.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances RF2:', sorted(zip(map(lambda x: round(x, 4), rfLong.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB2:', sorted(zip(map(lambda x: round(x, 4), regGB2.feature_importances_), df_final.columns), reverse=True))

    print "Building XGB1"
    xgbCV1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None,
                        learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7)
    xgbCV1.fit(cvX_train, cvy_train);
    xgb1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None,
                        learning_rate=0.01, silent=True, subsample=0.8, colsample_bytree=0.7)
    xgb1.fit(X_train,y_train);
    print('XGB1 Model CV :', mean_absolute_error(cvy_test, xgbCV1.predict(cvX_test)))
    cvClfs.append(xgbCV1)
    clfs.append(xgb1)
    pd.DataFrame({"ID":out_id, "Expected":xgb1.predict(X_test)}).to_csv("subXGB1.csv", index=False,header=True);



    print "Building XGB2"
    params = {}
    params["objective"] = "reg:linear"
    params["learning_rate"] = 0.005
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.75
    params["silent"] = 1
    params["max_depth"] = 7
    params["n_estimators"] = 3000
    params['gamma'] = 1.25
    params['nthread'] = -1
    print('XGBoost Training Process Started')
    xgbCV2 = XGBRegressor(**params);
    xgbCV2.fit(cvX_train, cvy_train);
    print('XGB Model CV :', mean_absolute_error(cvy_test, xgbCV2.predict(cvX_test)))
    xgb2 = XGBRegressor(**params);
    xgb2.fit(X_train,y_train);
    cvClfs.append(xgbCV2)
    clfs.append(xgb2)
    pd.DataFrame({"ID":out_id, "Expected":xgb2.predict(X_test)}).to_csv("subXGB2.csv", index=False,header=True);


    # Return the cross validated models and the actual fitted models separately.
    return [clfs,cvClfs];
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error");
# This line is required for the plot to be displayed on screen
plt.show()
# Save the current figure to result.png
#plt.savefig('./xgboostparams.png')

# 91 16889
xgb = XGBRegressor(max_depth=6,n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y,xgb.predict(val_x)))

#gbdt
'''
print "GradientBoostingRegressor"    
gbdt = GradientBoostingRegressor(n_estimators = 1000,max_leaf_nodes = 400)
gbdt.fit(X, y)#17083
#RandomForestRegressor 93  16938
#GradientBoostingRegressor 90 16866
#XGBRegressor 100 19939 
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y,gbdt.predict(val_x)))

# predict and save output
#print ("The predictions are")
Example no. 9
test_size=.3

X_train, X_test, y_train, y_test = train_test_split(X, log_loss, test_size=test_size, random_state=seed)


model=XGBRegressor(learning_rate=0.08,
                   max_depth=10,
                   objective='reg:linear',
                   nthread=3,
                   gamma=0.2,
                   subsample=0.9,
                   n_estimators=100,
                   )
model.fit(X_train, y_train)
print(model)
y_pred=model.predict(X_test)

def mae(predicted, actual, logscale=False):
    if logscale == True:
        predexp=np.exp(predicted)
        actualexp=np.exp(actual)
        return np.mean(np.abs(predexp - actualexp))
    else:
        return np.mean(np.abs(predicted - actual))

print(mae(y_pred, y_test, True))


# #Plotting Variable Importance
# plt.bar(range(len(model.feature_importances_)), model.feature_importances_)
# plt.title('Variable Importance')
Example no. 10
print(X.head())
#%%
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

my_model = XGBRegressor()
#%%
my_model.fit(train_X,
             train_y,
             early_stopping_rounds=5,
             eval_set=[(val_X, val_y)],
             verbose=False)
#%%
print("Making predictions for the following 5 situations:")
print(X.head())
print("The predictions are")
print(my_model.predict(X.head()))

#%%

predictions = my_model.predict(val_X)
#print (predictions)

#%%

#print((mean_squared_error(val_y, )

rmse = np.sqrt(mean_squared_error(val_y, predictions))
R = r2_score(val_y, predictions)
MAPE = np.mean(np.abs((val_y - predictions) / val_y)) * 100

print("RMSE: %f" % (rmse))
#           alpha = 1,
#           gamma = 2,
#           min_child_weight = 1,
#           base_score = 7.76
#           nrounds=5000,
#           nfold=5,
#           early_stopping_rounds=15,
#           print_every_n = 10,
#           verbose= 1,
#           feval=xg_eval_mae,
#           maximize=FALSE
#           )

    folds = KFold(n_splits=3, shuffle=False)
    for k, (train_index, test_index) in enumerate(folds.split(train_xg_x)):
        xtr = train_xg_x[train_index]
        ytr = train_xg_y[train_index]
        xtest = train_xg_x[test_index]
        ytest = train_xg_y[test_index]
        print "Fitting on fold {}...".format(k)
        print "Checking xtest shape: ", xtest.shape
        print "Checking ytest shape: ", ytest.shape
        xgboosting.fit(xtr, ytr, verbose=True)
        np.savetxt('xgb_pred_fold_{}.txt'.format(k), np.exp(xgboosting.predict(xtest)))
        np.savetxt('xgb_test_fold_{}.txt'.format(k), ytest)

    # Training xgboost on test set (i.e. whole train set).
    xgboosting.fit(train_xg_x, train_xg_y, verbose=True)
    print "Fitting on test set..."
    np.savetxt('xgb_pred_test.txt', np.exp(xgboosting.predict(test_xg_x)))
Example no. 12
ERR=0
min_err=1

for j in [15000]:
    # m=pd.read_csv('./data/result/6_0.0322.csv',names=['vid','收缩压', '舒张压', '血清甘油三酯', '血清高密度脂蛋白','血清低密度脂蛋白'])




############################################

    # clf = LGBMRegressor(n_estimators=100, subsample_for_bin=j,learning_rate=0.08,num_leaves=46,subsample=0.97,min_split_gain=3)
    clf = XGBRegressor(n_estimators=200, max_depth=7, min_child_weight=1, gamma=0)
    for i in y_names:
        clf.fit(train_x, train_y[i])
        pred_ = clf.predict(test_x)
        a = sum((np.log(pred_ + 1) - np.log(test_y[i] + 1)) ** 2) / len(pred_)
        joblib.dump(clf, './data/model/{0}.model_{1}'.format(i, a))
        print(a)
        ERR += a
    print(ERR / 5.0)
#################################################
    # for i in y_names:
    #     if i=='血清甘油三酯':
    #         # clf=joblib.load('./data/model_select/血清甘油三酯.model_0.07747025561699392')
    #         pass
    #     elif i=='血清高密度脂蛋白':
    #         clf=joblib.load('./data/model/血清高密度脂蛋白.model_0.011323631892292683')
    #     elif i=='收缩压':
    #         clf=joblib.load('./data/model/收缩压.model_0.014185246052119297')
    #     elif i=='舒张压':
Example no. 13
y = array[:,len(names)-1]

# Split-out to test and validation sets
test_size = 0.4
# Keep time series order by shuffle=False
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=test_size, shuffle=False)

# Fit model
model = XGBRegressor(n_estimators=200, max_depth=2, learning_rate=0.1, 
    colsample_bylevel=0.8, n_jobs=-1)
model.fit(X_train, y_train)
print(model)

# Make predictions
predictions = model.predict(X_test)

# Evaluate predictions
evc = explained_variance_score(y_test, predictions)
print("Explained variance: %.2f%%" % (evc * 100.0))
mae = mean_absolute_error(y_test, predictions)
print("Mean absolute error: %.2f" % (mae)) 
rmse = sqrt(mean_squared_error(y_test, predictions))
print("RMSE: %.2f" % (rmse)) 
r2 = r2_score(y_test, predictions)
print("R2 coefficient of determination: %.2f" % (r2)) 

# For sampled population confidence interval = standard error * 1.96; https://en.wikipedia.org/wiki/Standard_error
# Here used 2 * std
error = std(predictions)*2
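# A self-contained sketch of the interval idea noted above: the band is roughly
# prediction +/- 2 * standard deviation, used here as a stand-in for
# standard error * 1.96. The numbers below are made up for illustration only.
import numpy as np
demo_predictions = np.array([10.2, 11.5, 9.8, 12.1, 10.9])
demo_error = np.std(demo_predictions) * 2
lower, upper = demo_predictions - demo_error, demo_predictions + demo_error
print("Half-width of the +/- 2*std band: %.2f" % demo_error)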
Example no. 14
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=["logloss", "rmse"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)
#rmse, mae, logloss, error, auc
#error is not a regression metric
#when using two or more eval metrics, pass them as a list
result = model.evals_result()
print("eval's results :", result)

y_predict = model.predict(x_test)
r2 = r2_score(y_test, y_predict)
print("r2 Score : %.2f%%" % (r2 * 100.0))
print("r2 :", r2)
# Stopping. Best iteration:
# [28]    validation_0-rmse:0.06268       validation_1-rmse:0.28525
#training was cut off once the validation score started rising; between train loss and validation, validation is what matters

import matplotlib.pyplot as plt

epochs = len(result['validation_0']['logloss'])
#number of epochs (boosting rounds) that were actually run
x_axis = range(0, epochs)

thresholds = np.sort(model.feature_importances_)
#sort feature importances from lowest to highest
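# A minimal sketch of what the epochs / x_axis values above are typically used for:
# plotting the two eval_set curves stored in `result` by model.evals_result().
# It assumes `result`, `x_axis` and `plt` from the snippet above; the layout is illustrative.
fig, ax = plt.subplots()
ax.plot(x_axis, result['validation_0']['rmse'], label='train')
ax.plot(x_axis, result['validation_1']['rmse'], label='validation')
ax.set_ylabel('rmse')
ax.set_title('XGBRegressor eval_set RMSE per boosting round')
ax.legend()
plt.show()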
Example no. 15
xgb1 = XGBRegressor(n_estimators=100,
                    subsample=0.9,
                    learning_rate=0.08,
                    reg_alpha=1e-05)
xgb1.fit(X_train, y_train)
print("---2----")
predict_df = pd.read_excel(
    '/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A.xlsx',
    header=0,
    encoding='utf-8')

# predict_df=predict_df.drop(["ID"],axis=1)
predict_df = predict_df[quantity]
# predict_df = Imputer().fit_transform(predict_df)
print("---3----")
pred = xgb1.predict(predict_df)
pred_df = pd.DataFrame()
pred_df["pred"] = pred
pred_df.to_csv(
    '/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A-答案模板_pred.csv',
    index=False,
    float_format='%.4f')
print("---4----")
submission_df = pd.DataFrame()

submission_iddf = pd.read_csv(
    '/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A-答案模板.csv')
pred_df_TEMP = pd.read_csv(
    '/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A-答案模板_pred.csv')
submission_df["id"] = submission_iddf["id"]
submission_df["pred"] = pred_df_TEMP["pred"]
test_X = data_test.copy()
numeric_cols = train_X.dtypes[train_X.dtypes != 'object'].index
train_X = train_X[numeric_cols]

test_X = test_X[numeric_cols]

train_X = train_X.fillna(train_X.mean())

test_X = test_X.fillna(test_X.mean())

from xgboost import XGBRegressor

XGBmodel = XGBRegressor()

XGBmodel.fit(train_X, train_y, verbose=False)
predictions = XGBmodel.predict(test_X)
XGB_Submission = pd.read_csv(
    r'I:\Data Science Fundamentals\house-prices-advanced-regression-techniques\sample_submission.csv',
    index_col='Id')
XGB_Submission['SalePrice'] = XGBmodel.predict(test_X)
XGB_Submission.to_csv('XGB_Submission.csv')
XGB_Submission.head()

# In[97]:

# Model 5: XGB Classifier
train_X = data_train.loc[:, :
                         'SaleCondition']  #'SaleCondition' is the second last column before 'SalePrice'
train_y = data_train.loc[:, 'SalePrice']

#This DataFrame will be used for the predictions
Example no. 17
print(rmse_list)
hparam = vals[np.argmin(rmse_list)]
print('the best hyperparameter',hparam)
plt.plot(vals,rmse_list) #1963 optimal n_estimators
plt.xlabel('hyperparameter')
plt.ylabel('mean absolute error')
plt.title('hyperparameter tuning')
plt.show()
'''
#model fit for final predictions
model = XGBRegressor(n_estimators=2300,
                     gamma=0.06868686868686869,
                     colsample_bytree=0.8383838384,
                     subsample=0.36969696969696975,
                     reg_lambda=0.4444444444444445,
                     max_depth=1,
                     eval_metric="rmse",
                     reg_alpha=0.4050707070707071)
model.fit(X, y, verbose=False)
predictions = pd.Series(model.predict(predict_these))
predictions.to_csv("Submission.csv")
f = open("Submission.csv", "r")
f_out = open("Submission_out.csv", "w")
for line in f.readlines():
    line = line.split(',')
    line[0] = str(int(line[0]) + 1)
    line = ",".join(line)
    f_out.write(line)
f.close()
f_out.close()
Example no. 18
# -*- coding: utf-8 -*-

from xgboost import XGBRegressor
import pandas as pd

train = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\train.csv")
test = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\test.csv")

train.drop('ID', axis=1, inplace=True)

y_train = train.pop('target')
pred_index = test.pop('ID')

reg = XGBRegressor()
reg.fit(train, y_train)
y_pred = reg.predict(test)

submit = pd.DataFrame()
submit['ID'] = pred_index
submit['target'] = y_pred
submit.to_csv('my_XGB_prediction.csv', index=False)
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

# %%
scaled_X_train = standard_scaler.fit_transform(X_train_xgb)
scaled_X_test = standard_scaler.transform(X_test_xgb)

# %%
from xgboost import XGBRegressor
modelXGB = XGBRegressor(n_estimators=2000, learning_rate=0.8)
modelXGB.fit(scaled_X_train,
             y_train_xgb,
             eval_set=[(scaled_X_train, y_train_xgb),
                       (scaled_X_test, y_test_xgb)],
             verbose=True)
modelXGB_pred = modelXGB.predict(scaled_X_test)

# %%
from sklearn.metrics import mean_absolute_error
print('XGBOOST MAE = ', (mean_absolute_error(modelXGB_pred, y_test_xgb)))

# %%
XGBOOST_df = pd.DataFrame({'y': modelXGB_pred})
XGBOOST_df.index = y_test_xgb.index


# %%
def coversion(X, day_new):
    X['day'] = pd.DatetimeIndex(day_new).day
    X['month'] = pd.DatetimeIndex(day_new).month
    X['quarter'] = pd.DatetimeIndex(day_new).quarter
Example no. 20
accuracies = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=5)
accuracies.mean()
accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
#parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
#              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
parameters = [{
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5]
}]
grid_search = GridSearchCV(estimator=regressor,
                           param_grid=parameters,
                           cv=5,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Predicting new values with test data
y_pred = regressor.predict(X_test)

# Finding the rmse value
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
Example no. 21
            x1 = joblib.load(data[1])
            X = pd.concat((x0, x1), axis=1)
            del x0, x1

        X_train = X[:num_train]
        X_test = X[num_train:]
        del X

        # X_train, X_val, y_train, y_val = train_test_split(X,
        #                                                  y,
        #                                                  test_size=0.1,
        #                                                  random_state=42)

        xgbr.fit(
            X_train,
            y_train,
            # eval_metric='rmse',
            # early_stopping_rounds=30,
            verbose=True,
            # eval_set=[(X_val, y_val)],
        )

        y_pred = xgbr.predict(X_test)

        res.append(y_pred)

    final_res = np.mean(res, axis=0)
    final_res[final_res < 1] = 1
    final_res[final_res > 3] = 3
    pd.DataFrame({"id": id_test, "relevance": final_res}).to_csv("xgbr_sub_20160423_bag.csv", index=False)
Example no. 22
    imputed_final_test[col + '_was_missing'] = imputed_final_test[col].isnull()

# Imputation
imputer = Imputer()
train_X = imputer.fit_transform(imputed_X_train_plus)
test_X = imputer.transform(imputed_X_test_plus)
output_test = imputer.transform(imputed_final_test)

# train the model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(train_X,
          train_y,
          early_stopping_rounds=5,
          eval_set=[(test_X, test_y)],
          verbose=False)

# test the actuary
val_predictions = model.predict(test_X)
print("Mean Absolute Error : " +
      str(mean_absolute_error(val_predictions, test_y)))

# output
output_predictions = model.predict(output_test)
print(output_predictions)

my_submission = pd.DataFrame({
    'Id': test_data.Id,
    'SalePrice': output_predictions
})
my_submission.to_csv('submission.csv', index=False)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error");
# This line is required for the plot to be displayed on screen
plt.show()
# Save the current figure to result.png
#plt.savefig('./xgboostparams.png')
'''

# XGBRegressor 91 16889
print "XGBRegressor"  
xgb = XGBRegressor(max_depth=6,n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y, xgb.predict(val_x)))

#gbdt

print "GradientBoostingRegressor"    
gbdt = GradientBoostingRegressor(n_estimators = 1000,max_leaf_nodes = 400)
gbdt.fit(X, y)#17083
#RandomForestRegressor 93  16938
#GradientBoostingRegressor 90 16866
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y, gbdt.predict(val_x)))

#xgb & gbdt
predicted = (xgb.predict(val_x) + gbdt.predict(val_x))/2
print(mean_absolute_error(val_y, predicted))
Example no. 24
random_grid = {'n_estimators':n_estimators,
               'max_depth':max_depth,
               'learning_rate':learning_rate}
print(random_grid)

rand_cv = RandomizedSearchCV(estimator = xg_reg, param_distributions=random_grid, scoring= 'neg_mean_squared_error', cv = 10  )

rand_cv.fit(x_train, y_train)

rand_cv.best_params_

xg_reg = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators= 1000)

xg_reg.fit(x_train,y_train)

y_rg = xg_reg.predict(x_test)

sns.distplot(y_test - y_rg)

print('MAE:', metrics.mean_absolute_error(y_test, y_rg))
print('MSE:', metrics.mean_squared_error(y_test, y_rg))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_rg)))

from lightgbm import LGBMRegressor
lg = LGBMRegressor()

#HyperParameter tuning
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
max_depth = [-5,-3,-2,-1,1,3,5,8,15,20,25,30]
learning_rate = [0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5]
from sklearn.model_selection import RandomizedSearchCV
Example no. 25
typ = df_valid.dtypes
df_valid.to_csv('df_valid_cat.csv', header=None, index=False)

df_train.columns
RFR = RandomForestRegressor()
RFR.fit(X_train, Y_train)

RFR_preds = pd.DataFrame(RFR.predict(X_test),columns=['salePrice'],index=Y_test.index)
print(mean_absolute_error(Y_test, RFR_preds))

RFR_new = RFR_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

XGB = XGBRegressor()
XGB.fit(X_train, Y_train, verbose=False)
XGB_preds = pd.DataFrame(XGB.predict(X_test),columns=['salePrice'],index=Y_test.index).astype(int)
print(mean_absolute_error(Y_test,XGB_preds))

XGB_new = XGB_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

GBR = GradientBoostingRegressor()
GBR.fit(X_train, Y_train)
GBR_preds = pd.DataFrame(GBR.predict(X_test),columns=['salePrice'],index=Y_test.index)
print(mean_absolute_error(Y_test,GBR_preds))

GBR_new = GBR_preds.apply(lambda x: np.power(np.e,x).astype('int64'))

sns.swarmplot(x=GBR_preds['salePrice'],y=Y_test)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
Example no. 26
import time

start = time.time()

for thresh in thresholds:  # loop once for each feature column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    # threshold= median

    select_x_train = selection.transform(x_train)
    print(select_x_train.shape)

    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)
    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)

    score = r2_score(y_test, y_predict)

    print("Thresh=%.3f, n=%d, R2: %.2f%%" %
          (thresh, select_x_train.shape[1], score * 100.0))

end = time.time() - start
print(end)

import time

start2 = time.time()

for thresh in thresholds:  # loop once for each feature column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
Example no. 28
                                 options=[1, 2, 3, 4, 5, 6, 7, 8],
                                 index=2)

    # CREATE TRAIN & TEST SETS: training size = 80%  test size =20%
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2)  #,random_state=42)

    # TEST FOR TWO MODELS:
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred1 = lr.predict(X_test)
    score = r2_score(y_test, y_pred1)

    xgb = XGBRegressor(n_jobs=-1, random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    #pred = [round(value) for value in y_pred]
    score2 = r2_score(y_test, y_pred)
    # print("R squared score is %.2f%%" % (r2_score(y_test,y_pred)*100.0))

    disp_col.subheader('The average price of the house:')
    disp_col.write(abs(lr.predict([[n_area, n_beds, n_baths]])))
    disp_col.subheader('R squared score of the MLR model:')
    disp_col.write(score)

    disp_col.subheader('R squared score of XGBRegressor model:')
    disp_col.write(score2)

    #disp_col.subheader('Mean squared error of the model:')
    #disp_col.write(mean_squared_error(y_test,predictions))
    #disp_col.subheader('Mean absolute error of the model:')
Example no. 29
        'subsample': [0.6,0.7,0.8],
        'colsample_bytree': [0.6,0.7,0.8]},
        verbose=1, n_jobs=2)
    regressor = gscv.fit(np.array(train), train[goal])
    print(regressor.best_score_)
    print(regressor.best_params_)
else:
    regressor.fit(np.array(train[features]), train[goal])
print('  -> Training time:', time.time() - start)

# Evaluation and export result
if sample:
    if not gridsearch:
        # Test results
        if logexp:
            print "RMSPE: " + str(rmspe(map(lambda x : np.exp(x)-1, regressor.predict(np.array(test[features]))),test[goal].values))
        else:
            print "RMSPE: " + str(rmspe(regressor.predict(np.array(test[features])),test[goal].values))
else:
    csvfile = 'result/' + regressor.__class__.__name__ + '-submit.csv'
    with open(csvfile, 'w') as output:
        predictions = []
        for i in test[myid].tolist():
            # stores that haven't opened will have 0 sales
            if test[test[myid] == i]['Open'].item() == 0:
                predictions += [[i,0]]
            else:
                # import pdb;pdb.set_trace()
                if logexp:
                    predictions += [[i,np.exp(regressor.predict(np.array(test[test[myid]==i][features]))[0])-1]]
                else:
class PrudentialRegressorFO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
                       objective=self.objective,
                       learning_rate=self.learning_rate,
                       min_child_weight=self.min_child_weight,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       max_depth=self.max_depth,
                       n_estimators=self.n_estimators,
                       nthread=self.nthread,
                       missing=0.0,
                       seed=self.seed)
        from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
#                       basinhopping=True,
                       initial_params=self.initial_params,
                       minimizer=self.minimizer,
                       scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
Example no. 31
# In[210]:

result = grid_search.fit(train,y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))


# In[211]:

model=XGBRegressor(learning_rate=0.3,n_estimators=100)
for traincv, testcv in kfold:
    model.fit(train.iloc[traincv], y.iloc[traincv])



# In[212]:

y_pred=model.predict(test)


# In[213]:

output2 = pd.DataFrame( data={"outlet_no":outlet,"total_sales_Actual": y_pred} )
output2.to_csv("model.csv", index=False,quoting=3)


# In[ ]:



#Build the model
#model=ExtraTreesRegressor()
#model=RandomForestRegressor()
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#                  'learning_rate': 0.01, 'loss': 'ls'}
params = {'n_estimators': 400, 'max_depth': 7}
#model=GradientBoostingRegressor(**params)
model=XGBRegressor(**params)
#model=GaussianNB()
#model=Ridge()
#model=KNeighborsRegressor()
#model=DecisionTreeRegressor()
model.fit(train_dataset,train_target)

#Predict with the model
predictions=model.predict(test_dataset)


# In[51]:

### Cross Validation ###

#cv = StratifiedKFold(train_dataset, n_folds=5)

###scoring
scores = cross_validation.cross_val_score(model, train_dataset, train_target, cv=5)
print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

### getting the predictions ###
#predicted = cross_validation.cross_val_predict(clf, train_dataset, train_target, cv=10)
#print metrics.accuracy_score(train_target, predicted)
model = xgb.fit(X_train, Y_train)
# model.feature_importances_;

from xgboost import XGBRegressor

#Get Data
Y_train = train_df['price_doc'].values
X_train = train_df.loc[:, train_df.columns != 'price_doc'].values
X_test = test_df.values
#Init Model
xgb = XGBRegressor(learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.7)
#Train Model
model = xgb.fit(X_train, Y_train)
#Make Predictions
predictions = xgb.predict(X_test)


#Make Submission File
submission_df = pd.DataFrame({'id':test_full['id'], 'price_doc':predictions})
submission_df.to_csv('xgb-added_features.csv', index=False)

################################### SVM ############################################
# from sklearn.decomposition import PCA

# pca = PCA(n_components=0.8, whiten=True) 
# train_x = pca.fit_transform(X_train) 
# test_x = pca.transform(X_test) 
# svm_dr = svm.SVC(kernel='rbf', C=10) 
# svm_dr.fit(train_x, ravel(Y_train)) 
# predictions=svm_dr.predict(test_x)
Example no. 34
def Model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    def evaluate(model, test_features, test_labels,train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f}.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))    
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    
    
    """
        . Ridge model
    """
    
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    
    
    """
        . Random Forest
    """
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train=train_linear_fea
    Y_train=train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0)
    
    
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    #
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    
    #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search,
    # we can explicitly specify every combination of settings to try. 
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110,120,130],
        'max_features': [2, 3],
        'min_samples_leaf': [1,2,3, 4],
        'min_samples_split': [2,4,6,8, 10, 12],
        'n_estimators': [600,700, 800, 900, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    
    best_random = grid_search.best_estimator_
    start=time.time()
    best_random.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_rf_predict=best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_rf = importance_rf.iloc[:20,]
    
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_rf=np.expm1(best_random.predict(test_linear))
    
    """
        . Xgboost
    """
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
        # Minimum sum of instance weights required in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    
    return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
Example no. 35
def XgBoost(train_linear, test_linear):
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
        # Minimum sum of instance weights required in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    from xgboost import XGBRegressor
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    importance_xgb=importance_xgb[importance_xgb['features']!='Id']
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    write_pkl(xgb_random.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/xgb_params.pkl')
    return test_prediction_xgb