Example #1
0
def main():
    """Train the final XGBoost model and write the Kaggle submission file.

    Runs the feature-engineering pipeline from ``dataEngr.clfInput``, trains
    an XGBoost booster with fixed hyper-parameters, pickles the booster to
    disk, then writes the prediction CSV for submission.
    """
    # Feature engineering: sessions features, user features, one-hot
    # encoding, then a train/test split.
    xgbInput = dataEngr.clfInput()
    xgbInput.get_sessionsFtr()
    xgbInput.users_ftrEng()
    xgbInput.one_hot()
    #xgbInput.binarize_targets()
    xgbInput.split_data()

    # Hyper-parameters selected from earlier cross-validation runs.
    param = {
        'eta': 0.14,
        'max_depth': 6,
        'subsample': .9,
        'colsample_bytree': .45,
    }
    nrounds = 102

    # Train the booster and persist it for later reuse.
    bst = train_xgb(xgbInput.train_X, xgbInput.train_Y, param, nrounds)
    with open('../xgbmodels/final/actionsUS_e13_102n.p', 'wb') as f:
        pickle.dump(bst, f)

    # Predict on the held-out test rows and write the submission CSV.
    submission = get_submission(bst, xgbInput.test_X, xgbInput.testDf.index, xgbInput.le)
    submission.to_csv('../submissions/final/actionsUS_e13_102n.csv', index=False)
Example #2
0
#parameter search by cross validating on only most recent data, folds are by month

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.grid_search import ParameterGrid
import kaggle_xgb
import calc_ndcg
import dataEngr
import pickle

# Read in data and run feature engineering for every column except the target:
# sessions features, user features, one-hot encoding, then a by-session split.
xgbInput = dataEngr.clfInput()
xgbInput.get_sessionsFtr('actions4.p')
xgbInput.users_ftrEng()
xgbInput.one_hot()
#xgbInput.binarize_targets()
xgbInput.split_bySess()

# Fixed booster settings: 12 target classes, soft-probability multiclass output.
param = {'num_class': 12, 'silent': 1, 'objective': 'multi:softprob'}

# Single-point "grid" -- values kept as lists so ParameterGrid can iterate them.
param_grid = {
    'eta': [.13],
    'max_depth': [6],
    'subsample': [.9],
    'colsample_bytree': [.7],
}
nrounds = 200

#set up dataframe to store cross-validation results from each iteration
col_names = ['test-error-mean', 'test-error-std', 'train-error-mean',