def main():
    """Train the final XGBoost classifier on engineered user/session
    features, persist the booster with pickle, and write the Kaggle
    submission CSV.
    """
    # Feature engineering: session features, user features, one-hot encoding,
    # then split into train/test. Targets are left un-binarized here.
    clf_input = dataEngr.clfInput()
    clf_input.get_sessionsFtr()
    clf_input.users_ftrEng()
    clf_input.one_hot()
    # clf_input.binarize_targets()
    clf_input.split_data()

    # Hyperparameters used to train the model (presumably chosen from the
    # cross-validation search elsewhere in this project — confirm).
    booster_params = {
        'eta': 0.14,
        'max_depth': 6,
        'subsample': .9,
        'colsample_bytree': .45,
    }
    n_rounds = 102

    booster = train_xgb(clf_input.train_X, clf_input.train_Y,
                        booster_params, n_rounds)

    # Persist the trained booster for later reuse.
    with open('../xgbmodels/final/actionsUS_e13_102n.p', 'wb') as model_file:
        pickle.dump(booster, model_file)

    # Predict on the test split and write the submission file.
    submission = get_submission(booster, clf_input.test_X,
                                clf_input.testDf.index, clf_input.le)
    submission.to_csv('../submissions/final/actionsUS_e13_102n.csv',
                      index=False)
#parameter search by cross validating on only most recent data, folds are by month import numpy as np import pandas as pd import xgboost as xgb from sklearn import preprocessing from sklearn.grid_search import ParameterGrid import kaggle_xgb import calc_ndcg import dataEngr import pickle #read in data and do feature engineering for all columns but the target xgbInput = dataEngr.clfInput() xgbInput.get_sessionsFtr('actions4.p') xgbInput.users_ftrEng() xgbInput.one_hot() #xgbInput.binarize_targets() xgbInput.split_bySess() param = {'num_class': 12, 'silent': 1, 'objective': 'multi:softprob'} param_grid = {} param_grid['eta'] = [.13] param_grid['max_depth'] = [6] param_grid['subsample'] = [.9] param_grid['colsample_bytree'] = [.7] nrounds = 200 #set up dataframe to store cross-validation results form each iteration col_names = ['test-error-mean', 'test-error-std', 'train-error-mean',