Exemplo n.º 1
0
def run_model(read_data_datapath, save_model_path):
    """Grid-search an XGBoost regressor, pickle the fitted search, and print its losses.

    Args:
        read_data_datapath: Path forwarded to ``util.prepare_train_test_set``.
        save_model_path: File the fitted ``GridSearchCV`` object is pickled to.

    Prints the best parameter combination and the train/test losses computed
    by ``util.quick_test_model``; nothing is returned.
    """
    # NOTE(review): 0.001 is passed straight through to the helper; presumably
    # a feature-selection threshold -- confirm against util.
    x_train, x_test, y_train, y_test = util.prepare_train_test_set(read_data_datapath, 0.001)

    # Base estimator; the seed is fixed so repeated runs are comparable.
    clf = xgb.XGBRegressor(seed=2017)

    # Exhaustive grid: 3 * 4 * 3 * 5 * 3 * 3 = 1620 combinations, each fitted
    # on 4 folds, so expect this search to be slow on large data.
    param_grid = {
        'gamma': [0.0, 0.2, 0.4],  # minimum loss reduction required to split a leaf further
        'max_depth': [3, 5, 7, 10],  # used in place of max_leaf_nodes
        'min_child_weight': [0.1, 1, 2],  # minimum sum of instance weight (hessian) in a child
        'n_estimators': [50, 100, 200, 250, 300],  # number of boosted trees to fit
        'reg_alpha': [0.1, 0.5, 1.0],  # L1 regularization term on weights
        'reg_lambda': [0.1, 0.5, 1.0],  # L2 regularization term on weights
    }
    CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, scoring='neg_mean_squared_error')
    CV_clf.fit(x_train, y_train)

    # Persist the whole fitted search object. The context manager guarantees
    # the file is flushed and closed even if pickling raises.
    with open(save_model_path, "wb") as model_file:
        pickle.dump(CV_clf, model_file)
    print('The best parameters are: \n %s' % CV_clf.best_params_)

    # Evaluate the fitted search on both splits and report the losses.
    train_loss, test_loss = util.quick_test_model(x_train, x_test, y_train, y_test, CV_clf, regression_loss)
    print("Train loss is %s, \n Test loss is %s  " % (train_loss, test_loss))
Exemplo n.º 2
0
def run_model(read_data_datapath, save_model_path):
    """Grid-search a decision-tree regressor, pickle the fitted search, and print its losses.

    Args:
        read_data_datapath: Path forwarded to ``util.prepare_train_test_set``.
        save_model_path: File the fitted ``GridSearchCV`` object is pickled to.

    Prints the best cross-validation score, the parameters of the best
    estimator, and the train/test losses computed by
    ``util.quick_test_model``; nothing is returned.
    """
    # NOTE(review): 0.005 is passed straight through to the helper; presumably
    # a feature-selection threshold -- confirm against util.
    x_cv, x_test, y_cv, y_test = util.prepare_train_test_set(read_data_datapath, 0.005)

    # The tree sits in a one-step pipeline named 'clf', hence the 'clf__'
    # prefix on the grid keys below.
    # NOTE(review): criterion='mse' was renamed to 'squared_error' in newer
    # scikit-learn releases; left unchanged to stay compatible with the
    # version this project was written against -- confirm before upgrading.
    clf = Pipeline([('clf', DecisionTreeRegressor(criterion='mse', random_state=0))])

    # Only min_samples_leaf is searched; the other candidate grids are kept
    # commented out for reference.
    parameters = {
        #'clf__max_depth': [125, 100, 75, 50, 40, 30, 25, 20, 15, 10, 5],
        #'clf__min_samples_split': [2, 3, 4 ,5, 6],
        'clf__min_samples_leaf': [1, 2, 3, 4, 5, 6],
    }

    CV_clf = GridSearchCV(estimator=clf, param_grid=parameters, cv=3, scoring='neg_mean_squared_error')
    CV_clf.fit(x_cv, y_cv)
    # With this scoring the best score is a negated MSE, so it is <= 0.
    print('Best score: %0.3f' % CV_clf.best_score_)

    # Persist the whole fitted search object. The context manager guarantees
    # the file is flushed and closed even if pickling raises.
    with open(save_model_path, "wb") as model_file:
        pickle.dump(CV_clf, model_file)
    print('Best parameters set are: \n %s' % CV_clf.best_estimator_.get_params())

    # Evaluate the fitted search on both splits and report the losses.
    train_loss, test_loss = util.quick_test_model(x_cv, x_test, y_cv, y_test, CV_clf, regression_loss)
    print("Train loss is %s, \n Test loss is %s  " % (train_loss, test_loss))
Exemplo n.º 3
0
def number_best_feature_set(read_data_datapath):
    """Print the training loss of a default XGBoost regressor for several thresholds.

    For each candidate value the data is re-split through
    ``util.prepare_train_test_set``, a fresh regressor is fitted, and the
    training loss is printed next to the value that produced it.

    Args:
        read_data_datapath: Path forwarded to ``util.prepare_train_test_set``.
    """
    # Initialised but never read in this body; kept so behaviour is unchanged.
    min_train_loss = 100
    threshold = 0

    candidate_values = np.array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006])
    for cutoff in candidate_values:
        features_fit, features_eval, target_fit, target_eval = util.prepare_train_test_set(
            read_data_datapath, cutoff
        )

        # Fresh model per candidate so earlier fits cannot leak into later ones.
        regressor = xgb.XGBRegressor(seed=2017)
        regressor.fit(features_fit, target_fit)

        losses = util.quick_test_model(
            features_fit, features_eval, target_fit, target_eval, regressor, regression_loss
        )
        train_loss, test_loss = losses
        print(train_loss, cutoff)
Exemplo n.º 4
0
# Import the necessary modules and libraries
#from importlib import reload
#reload(util)
import util
from util import regression_loss
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle

# Load the four-way split (x_cv, x_test, y_cv, y_test) from the pickled data file.
# NOTE(review): no second argument is passed here, so whatever default
# util.prepare_train_test_set defines applies -- confirm in util.
x_cv, x_test, y_cv, y_test = util.prepare_train_test_set('./data/df.p')

# Model under search: a random forest wrapped in a one-step Pipeline. The step
# is named 'clf', which is why the grid keys below carry the 'clf__' prefix.
# random_state is fixed so repeated runs are comparable.

clf = Pipeline([('clf', RandomForestRegressor(criterion='mse',
                                              random_state=0))])
# Unwrapped alternative, kept for reference:
#clf = RandomForestRegressor(criterion='mse',random_state=0)

# grid search for the best fit parameters

parameters = {
    'clf__n_estimators': (50, 40, 30, 20, 10),  #number of trees
    'clf__max_depth': (75, 50, 40, 30, 25, 10),
    'clf__min_samples_split': (2, 3, 4, 5),
    'clf__min_samples_leaf': (2, 3, 4, 5),
    'clf__min_impurity_split': (1e-8, 1e-7, 1e-6, 1e-5)
Exemplo n.º 5
0
import util
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

# Load the four-way split (x_cv, x_test, y_cv, y_test) from the pickled data file.
# NOTE(review): no second argument is passed here, so whatever default
# util.prepare_train_test_set defines applies -- confirm in util.
datapath = './data/df.p'
x_cv, x_test, y_cv, y_test = util.prepare_train_test_set(datapath)

# Optional exponential transform of the targets, currently disabled.
# NOTE(review): presumably undoes a log transform applied upstream -- confirm.
#y_cv = np.exp(y_cv)
#y_test = np.exp(y_test)

# Model under search: ridge regression with the solver capped at 3000 iterations.
clf = Ridge(max_iter=3000)

# Candidate regularization strengths: 13 log-spaced values from 1e-2 to 1e4,
# i.e. half-decade steps.
param_grid = {'alpha': np.logspace(-2, 4, num=13)}

# 2-fold grid search scored by negated mean squared error (higher is better).
CV_clf = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      cv=2,
                      scoring='neg_mean_squared_error')
CV_clf.fit(x_cv, y_cv)
# Full per-candidate results table, kept at module level for later inspection.
CV_result = CV_clf.cv_results_
# best_score_ is a negated MSE, so flip the sign before the square root to get an RMSE.
best_score = np.sqrt(-CV_clf.best_score_)
print('The best parameters are: %s' % CV_clf.best_params_)
print('The best RMSE is: %.3f' % best_score)

# visualizing purpose
import pickle
import sys
import os
import util
import pandas as pd
from util import regression_loss
#os.chdir('/Users/Sean/Desktop/DS1003_Final_Project')
#sys.path.append('/Users/Sean/Desktop/DS1003_Final_Project')

# Load each pickled dataset and its saved model from ./model/xgb_final, then
# predict on the matching held-out test split.
# NOTE: pickle.load can execute arbitrary code; only point these paths at
# trusted, locally produced files.
# Every file is opened in a `with` block so its handle is closed right after
# loading instead of staying open for the life of the process.

# entire
datapath = './data/encoded_entire.pkl'
with open(datapath, 'rb') as pkl_file:
    dataset = pickle.load(pkl_file)
read_model_path = "./model/xgb_final/xgb_entire_model_final.pkl"
with open(read_model_path, "rb") as model_file:
    model_entire = pickle.load(model_file)
# NOTE(review): 0.001 is passed straight through to the helper; presumably the
# same threshold the model was trained with -- confirm against the training run.
x_train_entire, x_test_entire, y_train_entire, y_test_entire = util.prepare_train_test_set(
    datapath, 0.001)
prediction_entire = model_entire.predict(x_test_entire)

# private
# `dataset`, `datapath`, `pkl_file`, `read_model_path` and `model_file` are
# rebound here; only the *_entire / *_private names keep both results.
datapath = './data/encoded_private.pkl'
with open(datapath, 'rb') as pkl_file:
    dataset = pickle.load(pkl_file)
read_model_path = "./model/xgb_final/xgb_private_model_final.pkl"
with open(read_model_path, "rb") as model_file:
    model_private = pickle.load(model_file)
x_train_private, x_test_private, y_train_private, y_test_private = util.prepare_train_test_set(
    datapath, 0.001)
prediction_private = model_private.predict(x_test_private)

prediction_entire = pd.DataFrame({
    'y_test_entire': list(y_test_entire),
    'prediction_entire': list(prediction_entire)