-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
92 lines (73 loc) · 2.79 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import eval
import genfeat
import xgboost as xgb
# ---------------------------------------------------------------------------
# File locations (relative paths).  Only RESULT_FILE is referenced in the part
# of this script shown here; the others may be used by code elsewhere.
# ---------------------------------------------------------------------------

# Monthly action logs plus comment / product / user tables under data/.
ACTION_201602_FILE = "data/JData_Action_201602.csv"
ACTION_201603_FILE = "data/JData_Action_201603.csv"
ACTION_201604_FILE = "data/JData_Action_201604.csv"
COMMENT_FILE = "data/JData_Comment.csv"
PRODUCT_FILE = "data/JData_Product.csv"
USER_FILE = "data/JData_User.csv"

# User table kept under cache/.
NEW_USER_FILE = "cache/JData_User_New.csv"

# Tables stored under sub/: full tables first, then "mini" subsets.
ALL_ACTION_FILE = "sub/raw_all_Action.csv"
ALL_PRODUCT_FILE = "sub/all_product.csv"
ALL_USER_FILE = "sub/all_user.csv"
MINI_USER_TRAIN = "sub/mini_user_train.csv"
MINI_USER_TEST = "sub/mini_user_test.csv"
MINI_ACT_TRAIN = "sub/mini_act_train.csv"
MINI_ACT_TEST = "sub/mini_act_test.csv"

# Per-split user tables and the product table under sub/.
USER_TRAIN = "sub/user_train.csv"
USER_VALID = "sub/user_valid.csv"
USER_TEST = "sub/user_test.csv"
NEW_PRODUCT = "sub/product.csv"

# Label files for the "mini" subsets.
MINI_TRAIN_LABEL = "sub/mini_train_label.csv"
MINI_TEST_LABEL = "sub/mini_test_label.csv"

# Final prediction output and temporary submission files.
RESULT_FILE = "res/result.csv"
TMP_ACT_SUBMIT = "cache/act_submit.csv"
TMP_USER_SUBMIT = "cache/user_submit.csv"
# Render floats in pandas output with thousands separators and three decimals.
pd.options.display.float_format = '{:,.3f}'.format

# Load the two splits.  Each genfeat call returns a (features, labels, users)
# triple; the "_r" names hold the training split, the "_t" names the test split.
df_r, label_r, users_r = genfeat.get_train_data()
df_t, label_t, users_t = genfeat.get_test_data()

# Under-sample the training split only; the test split is left as loaded.
# NOTE(review): presumably prob is the keep-probability for negative rows --
# confirm against genfeat.underSample.
df_r, label_r, users_r = genfeat.underSample(df_r, label_r, users_r, prob=0.03)

dtrain = xgb.DMatrix(df_r, label=label_r)
dtest = xgb.DMatrix(df_t, label=label_t)

# Number of boosting rounds handed to xgb.train.
num_round = 283

# Booster parameters.  Key order is kept as-is because the dict is flattened
# into an ordered list of pairs below.
# NOTE(review): 'learning_rate' and 'eta' are documented as aliases in XGBoost
# but carry different values here, and 'n_estimators' belongs to the
# scikit-learn wrapper rather than xgb.train -- confirm which were intended.
param = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 5,
    'gamma': 0,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 1,
    'eta': 0.05,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': "auc",
}

# A dict can hold only one 'eval_metric'; passing a list of pairs lets a second
# metric (logloss) be supplied alongside auc.
plst = list(param.items()) + [('eval_metric', 'logloss')]

# Metrics are reported on both the test split ('eval') and the training split.
evallist = [(dtest, 'eval'), (dtrain, 'train')]

# Train at import time; `bst` is the module-level booster used by test()/submit().
bst = xgb.train(plst, dtrain, num_round, evallist)

# The string below is an inert leftover snippet (never executed).
'''
sub_user_index, sub_trainning_date, sub_label = make_train_set(sub_start_date, sub_end_date,
sub_test_start_date, sub_test_end_date)
'''
def test(threshold = 0.5):
    """Score the test split and evaluate the users predicted above a cut-off.

    Predictions from the module-level booster ``bst`` on ``df_t`` are attached
    to ``users_t``; rows whose score is strictly greater than ``threshold`` are
    kept, de-duplicated on ``user_id``, and compared against the rows of the
    test split whose ``label`` is positive.

    Args:
        threshold: Score cut-off; only rows with a prediction strictly above
            it are treated as positive predictions.

    Returns:
        Whatever ``eval.eval(predicted, truth, True)`` returns.
    """
    scores = bst.predict(xgb.DMatrix(df_t))

    # bst.predict returns one score per input row, in row order, while
    # pd.concat(axis=1) aligns on index labels.  Dropping any existing index on
    # the user frame guarantees row i is paired with score i even if users_t
    # was filtered or shuffled upstream.
    users = users_t.reset_index(drop=True)
    pred = pd.concat([users, pd.DataFrame(scores)], axis=1)

    # Keep confident rows, then discard the score column (named 0 by default).
    picked = pred[pred[0] > threshold].drop(0, axis=1)
    picked = picked.drop_duplicates('user_id')

    # Ground truth: test-split rows with a positive label.
    # NOTE(review): this concat still aligns on index, which assumes users_t
    # and label_t share the same index -- confirm in genfeat.get_test_data.
    truth = pd.concat([users_t, pd.DataFrame(label_t)], axis=1)
    truth = truth[truth['label'] > 0]

    return eval.eval(picked, truth, True)
def submit(threshold=0.8):
    """Write the submission CSV of users predicted above a cut-off.

    Scores the data returned by ``genfeat.gen_submit_data()`` with the
    module-level booster ``bst``, keeps rows whose score is strictly greater
    than ``threshold``, de-duplicates on ``user_id``, casts ``user_id`` to
    32-bit integers and writes the result to ``RESULT_FILE`` without the index.

    Args:
        threshold: Score cut-off for inclusion in the submission.  Defaults to
            0.8, the value that was previously hard-coded.

    Returns:
        None.  The result is written to ``RESULT_FILE``.
    """
    df_s, users_s = genfeat.gen_submit_data()
    scores = bst.predict(xgb.DMatrix(df_s))

    # bst.predict returns one score per input row, in row order, while
    # pd.concat(axis=1) aligns on index labels.  Resetting the index pairs
    # row i with score i and avoids NaN user_ids (which would break the int
    # cast below) when users_s does not carry a default 0..n-1 index.
    users = users_s.reset_index(drop=True)
    pred = pd.concat([users, pd.DataFrame(scores)], axis=1)

    # Keep confident rows, then discard the score column (named 0 by default).
    pred = pred[pred[0] > threshold].drop(0, axis=1)
    pred = pred.drop_duplicates('user_id')

    # assign() builds a new frame instead of writing into a filtered slice.
    pred = pred.assign(user_id=pred['user_id'].astype(np.int32))
    pred.to_csv(RESULT_FILE, index=False)