-
Notifications
You must be signed in to change notification settings - Fork 3
/
tree.features.py
85 lines (66 loc) · 2.11 KB
/
tree.features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import sklearn as sk
import numpy as np
from csv import DictReader
from scipy.sparse import vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import SGDClassifier
# Seed handed to the random forest as its random_state.
SEED = 80085
# Paths of the training and test CSV files loaded by main().
train_loc = 'train.day.hour.csv'
test_loc = 'test.day.hour.csv'
# load and prepare training data
def load_train_data(path, nrows=4000000):
    """Load the training CSV and integer-encode every feature column.

    All columns are read as strings. The ``click`` column is split off as
    the integer target, ``id`` is discarded, and each remaining column is
    replaced by its categorical codes (one integer per distinct string in
    that column; missing values become -1).

    Args:
        path: Location of the training CSV. It must contain ``click`` and
            ``id`` columns.
        nrows: Maximum number of data rows to read. Defaults to the
            4,000,000-row cap the script has always used; pass ``None`` to
            read the whole file.

    Returns:
        A ``(features, click)`` tuple: the encoded feature DataFrame and the
        integer target Series.
    """
    train = pd.read_csv(path, dtype=str, nrows=nrows, low_memory=False)
    click = train['click'].astype(int)
    # Keyword `axis` is accepted by old and new pandas alike; the positional
    # form used previously is rejected by pandas 2.x.
    train = train.drop(['click', 'id'], axis=1)
    for colname in train.columns:
        # pd.Categorical.from_array was removed from pandas; the plain
        # constructor yields the same codes.
        train[colname] = pd.Categorical(train[colname]).codes
    return train, click
# load and prepare testing data
def load_test_data(path):
    """Load the test CSV and integer-encode every feature column.

    All columns are read as strings, ``id`` is discarded, and each remaining
    column is replaced by its categorical codes (one integer per distinct
    string in that column; missing values become -1).

    NOTE(review): the codes are derived from the values present in this file
    only, so a given string is not guaranteed to receive the same code it
    received in the training data. Confirm whether a shared category mapping
    is needed.

    Args:
        path: Location of the test CSV. It must contain an ``id`` column.

    Returns:
        The encoded feature DataFrame.
    """
    test = pd.read_csv(path, dtype=str, low_memory=False)
    # Keyword `axis` is accepted by old and new pandas alike; the positional
    # form used previously is rejected by pandas 2.x.
    test = test.drop('id', axis=1)
    for colname in test.columns:
        # pd.Categorical.from_array was removed from pandas; the plain
        # constructor yields the same codes.
        test[colname] = pd.Categorical(test[colname]).codes
    return test
def main():
    """Train a random-forest + logistic-regression stack and write predictions.

    Pipeline:
      1. Fit a shallow random forest on the integer-encoded training data.
      2. Map every sample to the index of the leaf it reaches in each tree
         (``rf.apply``) and one-hot encode those leaf indices.
      3. Fit an SGD-trained logistic regression on the one-hot leaf features.
      4. Apply the same forest and encoder to the test data and write the
         output of ``predict_proba`` to ``predictions.csv``.

    Reads ``train_loc`` and ``test_loc``; writes ``predictions.csv`` in the
    current working directory.
    """
    # initialize sklearn objects
    rf = RandomForestClassifier(n_estimators=300, max_depth=3, verbose=1,
                                random_state=SEED)
    # NOTE(review): scikit-learn 1.1+ renamed this loss to 'log_loss' and
    # later releases reject 'log'; update the string if running on a recent
    # version.
    logitsgd = SGDClassifier(loss='log', n_jobs=-1, verbose=1)
    encoder = OneHotEncoder()

    train, click = load_train_data(train_loc)

    # rf feature transformation: one leaf index per (sample, tree)
    rf.fit(train, click)
    train_rf = rf.apply(train)
    del train  # release the raw frame before the next large allocation

    # encode rf features for logit
    print('fitting encoder ... ')
    encoder.fit(train_rf)
    print('transforming ...')
    # Transform the whole leaf matrix in one call. The previous per-row
    # vstack loop started from an empty list (which vstack rejects), passed
    # 1-D rows to the encoder, and was quadratic in the number of rows.
    embedded_train = encoder.transform(train_rf)
    del train_rf

    # train model
    logitsgd.fit(X=embedded_train, y=click)
    del embedded_train

    # load testing data and push it through the same forest and encoder
    test = load_test_data(test_loc)
    test_rf = rf.apply(test)
    del test

    print('transforming ...')
    embedded_test = encoder.transform(test_rf)
    del test_rf

    # make predictions (the name `embedded_test` was previously undefined
    # here and raised NameError)
    prediction = logitsgd.predict_proba(embedded_test)

    # save predictions, one row per test sample
    np.savetxt("predictions.csv", prediction, delimiter=",")
if __name__ == '__main__':
    # Run the pipeline only when executed as a script, not when imported.
    main()