submission.py
"""
Kaggle : StumbleUpon Evergreen Classification Challenge
Author : log0
Contact : im __dot__ ckieric __at__ gmail __dot__ com
============================================================================================
Comments:
This is my 13th submission, though I did not select it for my final private leaderboard score.
It is actually my best submission, and it bested the other two I did select. This is a good
example of how overfitting can pull you down; I hope it serves as a good lesson, to me and to
others, on how higher-dimensional engineered features can be hazardous.
CV score: 0.888209675529
Public score: 0.88415 (Rank 57)
Private score: 0.87967 (Rank 291)
Note that due to the complexity and messiness of the code, this does not run by itself, but
the gist of the solution logic is here.
Enjoy!
============================================================================================
Solution Description:
First, I manually processed the body, title and url and used a custom word tokenizer on the
normalized text, which I then fed into a TfidfVectorizer. Along with that, I engineered some
features from the document contents by analyzing sentence POS tags. On top of those, and this
is where I made my demise... I engineered some higher-dimensional features, which is a perfect
recipe for overfitting (and the CV did not reveal it). Lastly, I applied Latent Semantic
Analysis (LSA) with 400-500 components and then a LogisticRegression on top of the combined
TfidfVectorizer and engineered features (carefully not looking into the test features, of course).
============================================================================================
Log output:
2013-10-31 14:41:19.118000
Reading 10000 data
Max data read : 7395
Max data read : 3171
Loaded from cache
bestwords count = 58088
train vector ready
Final X shape : (7395, 58143)
submission vector ready
2013-10-31 14:48:58.784000
0.888209675529 | [ 0.89300066 0.88945616 0.8989167 0.8817395 0.87793536]
2013-10-31 14:55:36.726000
============================================================================================
"""
import os
import re
import csv
import json
import numpy as np
import pickle
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import *
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import *
from sklearn.cross_validation import cross_val_score, KFold, StratifiedKFold
from sklearn import preprocessing
from sklearn import metrics
from text.blob import TextBlob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, wordpunct_tokenize
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.corpus import stopwords
import collections, itertools
import nltk.classify.util, nltk.metrics
import scipy.sparse as sparse
import data_io
import features
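# Precompiled regexes for text cleanup; of these, only r_url is referenced in this file
# (inside get_meta_vector_2).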
r_nonalnum = re.compile(r'\W+')
r_nonans = re.compile(r'[^\w\s]+')
r_digit = re.compile(r'\d+')
r_url = re.compile(r'https?://')
r_nonword = re.compile(r'[A-Z\d]+')
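# Vocabulary selection: score every training word by the chi-squared statistic of its
# association with the positive and negative labels (the standard NLTK
# BigramAssocMeasures.chi_sq recipe) and keep the top-scoring words as the
# TfidfVectorizer vocabulary. The result is cached to disk because tokenizing the
# whole corpus is slow.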
def get_bestwords(contents, labels, limit = 10000, n = None, cache = True):
    # Return the cached word set if we have already computed it for this (limit, n).
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'rb'))
                print 'Loaded from cache'
                print 'bestwords count = %d' % (len(bestwords))
                return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    neg_contents = contents[labels == 0]  # negative (non-evergreen) examples

    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))

    for word in pos_words:
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())
    for word in neg_words:
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Chi-squared association of each word with the positive and the negative class.
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:limit]
    bestwords = set([w for w, s in best])
    print 'all words count = %d' % (len(word_scores))
    print 'bestwords count = %d' % (len(bestwords))

    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            f = open(cache_path, 'wb')
            pickle.dump(bestwords, f)
            print 'Dumped to cache'

    return bestwords
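# Merge the title, body and url fields of each JSON boilerplate record into a single
# plain-text string per page.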
def get_doc_contents(data):
    contents = []
    blobs = [ json.loads(i) for i in data ]
    for blob in blobs:
        row = {'title': '', 'body': '', 'url': ''}
        for key in row.keys():
            if blob.has_key(key) and blob[key]:
                row[key] = blob[key]
        content = ' '.join(row.itervalues())
        contents.append(content)
    return np.array(contents)
def scale_vector(v):
    return v / float(np.max(v))

def get_char_count_vector(contents, char):
    v = [ content.count(char) + 1 for content in contents ]
    return np.array(v)
r_digits = re.compile(r'\d')

def get_digit_count_vector(contents):
    v = [ len(r_digits.findall(content)) for content in contents ]
    return np.array(v)
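# Meta features computed from the raw body text; several counts are built here, but
# only the '!' count and the squared '.' count are kept, each scaled by its column max.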
def get_meta_vector_2(contents_dict):
    v = []
    contents = [ content['body'] or '' for content in contents_dict ]

    def count_sent(s):
        if s is not None:
            return len(sent_tokenize(s))
        else:
            return 0
    v_count_sent = np.vectorize(count_sent)

    def count_url(s):
        return len(r_url.findall(s))
    v_count_url = np.vectorize(count_url)

    ws = {}
    ws['sent_count'] = v_count_sent(contents)
    ws['url_count'] = v_count_url(contents)
    ws['img_count'] = np.array([int(content['img_count'] or 0) for content in contents_dict])
    ws['?'] = get_char_count_vector(contents, '?').astype(float)
    ws['!'] = get_char_count_vector(contents, '!').astype(float)
    ws['.'] = get_char_count_vector(contents, '.').astype(float)
    ws[','] = get_char_count_vector(contents, ',').astype(float)

    v.append(ws['!']) # good stuff
    v.append(ws['.'] ** 2) # good stuff

    v = [scale_vector(i) for i in v]
    return np.array(v).transpose()
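# POS-tag meta features: counts of noun/adjective/adverb/verb/etc. tags plus punctuation
# and digit counts, expanded with powers and pairwise products. These interaction terms
# are the higher-dimensional engineered features blamed for overfitting in the header.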
def get_meta_vector(tags, contents):
    v = []
    ws = {}
    ws['nn_vector'] = get_pos_tag_count(tags, ['NN', 'NNS', 'NNP', 'NNPS']).astype(float) # 1.0, 0.5
    ws['ad_vector'] = get_pos_tag_count(tags, ['JJ', 'JJR', 'JJS']).astype(float) # 1.0, 0.5
    ws['av_vector'] = get_pos_tag_count(tags, ['RB', 'RBR', 'RBS']).astype(float) # 1.0, 0.5,
    ws['vb_vector'] = get_pos_tag_count(tags, ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']).astype(float) # 1.0, 0.5, 2.0
    ws['wd_vector'] = get_pos_tag_count(tags, ['WDT', 'WP', 'WP$', 'WRB']).astype(float) # 0.5
    ws['po_vector'] = get_pos_tag_count(tags, ['PRP', 'PRP$']).astype(float) # 1.0, 0.5
    ws['dt_vector'] = get_pos_tag_count(tags, ['DT']).astype(float)
    ws['cd_vector'] = get_pos_tag_count(tags, ['CD']).astype(float)
    ws['m1_vector'] = get_pos_tag_count(tags, ['DT', 'IN', 'JJ']).astype(float)
    ws['in_vector'] = get_pos_tag_count(tags, ['IN']).astype(float)
    ws['md_vector'] = get_pos_tag_count(tags, ['MD']).astype(float)
    ws['cn_vector'] = get_count_vector(tags).astype(float)
    ws['?'] = get_char_count_vector(contents, '?').astype(float)
    ws['!'] = get_char_count_vector(contents, '!').astype(float)
    ws['.'] = get_char_count_vector(contents, '.').astype(float)
    ws[','] = get_char_count_vector(contents, ',').astype(float)
    ws['numbers'] = get_digit_count_vector(contents).astype(float)

    # Plain counts.
    v.append(ws['nn_vector']) # good stuff
    v.append(ws['ad_vector']) # good stuff
    v.append(ws['vb_vector']) # good stuff
    v.append(ws['po_vector']) # good stuff
    v.append(ws['cn_vector']) # good stuff
    v.append(ws['?']) # good stuff

    # Powers of the raw counts.
    v.append(ws['nn_vector'] ** 0.5) # good stuff
    v.append(ws['ad_vector'] ** 0.5) # good stuff
    v.append(ws['av_vector'] ** 0.5) # good stuff
    v.append(ws['vb_vector'] ** 0.5) # good stuff
    v.append(ws['wd_vector'] ** 0.5) # good stuff
    v.append(ws['po_vector'] ** 0.5) # really good shit
    v.append(ws['dt_vector'] ** 0.5) # really good shit, pushes up to 0.88404
    v.append(ws['cn_vector'] ** 0.5) # good stuff
    v.append(ws['m1_vector'] ** 0.5) # good stuff
    v.append(ws['vb_vector'] ** 2) # good stuff
    v.append(ws['ad_vector'] ** 3.0) # good stuff
    v.append(ws['av_vector'] ** 3.0) # good stuff
    v.append(ws['vb_vector'] ** 3.0) # good stuff
    v.append(ws['cd_vector'] ** 3.0) # good stuff

    # Pairwise interaction terms (the higher-dimensional features discussed in the header).
    v.append(ws['nn_vector'] * ws['ad_vector']) # good stuff
    v.append(ws['nn_vector'] * ws['dt_vector']) # good stuff
    v.append(ws['nn_vector'] * ws['m1_vector']) # good stuff
    v.append(ws['nn_vector'] * ws['md_vector']) # good stuff
    v.append(ws['nn_vector'] * ws['?']) # good stuff
    v.append(ws['ad_vector'] * ws['av_vector']) # good stuff
    v.append(ws['ad_vector'] * ws['vb_vector']) # good stuff
    v.append(ws['ad_vector'] * ws['wd_vector']) # good stuff
    v.append(ws['ad_vector'] * ws['m1_vector']) # good stuff
    v.append(ws['ad_vector'] * ws['cn_vector']) # good stuff
    v.append(ws['av_vector'] * ws['wd_vector']) # good stuff
    v.append(ws['av_vector'] * ws['?']) # good stuff
    v.append(ws['av_vector'] * ws['!']) # good stuff
    v.append(ws['av_vector'] * ws['.']) # really good stuff
    v.append(ws['vb_vector'] * ws['wd_vector']) # good stuff
    v.append(ws['vb_vector'] * ws['?']) # good stuff
    v.append(ws['vb_vector'] * ws['!']) # good stuff
    v.append(ws['vb_vector'] * ws['.']) # good stuff
    v.append(ws['wd_vector'] * ws['po_vector']) # good stuff
    v.append(ws['wd_vector'] * ws['dt_vector']) # good stuff
    v.append(ws['po_vector'] * ws['?']) # good stuff
    v.append(ws['dt_vector'] * ws['cn_vector']) # good stuff
    v.append(ws['cd_vector'] * ws[',']) # good stuff
    v.append(ws['m1_vector'] * ws['cn_vector']) # no effect!
    v.append(ws['m1_vector'] * ws['?']) # good stuff
    v.append(ws['m1_vector'] * ws['numbers']) # good stuff
    v.append(ws['in_vector'] * ws['?']) # good stuff
    v.append(ws['in_vector'] * ws['numbers']) # good stuff
    v.append(ws['md_vector'] * ws['?']) # good stuff
    v.append(ws['md_vector'] * ws['numbers']) # good stuff
    v.append(ws['cn_vector'] * ws['!']) # good stuff
    v.append(ws['?'] * ws['.']) # good stuff
    v.append(ws[','] * ws['numbers']) # good stuff

    v = [scale_vector(i) for i in v]
    return np.array(v).transpose()
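# Count the occurrences of the POS tags listed in `target` across a document's tagged
# body, title and url, plus one.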
def get_pos_tag_count(tags, target):
    v = [ None for i in xrange(len(tags)) ]
    for i, tag in enumerate(tags):
        entries = (tag['body'] or []) + (tag['title'] or []) + (tag['url'] or [])
        v[i] = sum(1 if entry[1] in target else 0 for entry in entries)
        v[i] = v[i] + 1
    return np.array(v)
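# Total number of tagged tokens across body, title and url, plus one.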
def get_count_vector(tags):
    count_vector = [ 0 for i in xrange(len(tags)) ]
    for i, tag in enumerate(tags):
        count_body = len(tag['body'] or [])
        count_title = len(tag['title'] or [])
        count_url = len(tag['url'] or [])
        count_vector[i] = count_body + count_title + count_url + 1
    return np.array(count_vector)
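# Main script: load cached POS tags and contents, build the TF-IDF matrix over the
# chi-squared vocabulary, stack on the engineered meta features, cross-validate a
# TruncatedSVD + LogisticRegression pipeline on AUC, and write the test predictions
# to a submission CSV.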
if __name__ == '__main__':
    print datetime.datetime.now()

    np.random.seed(1500) # 500 is original

    train_path = 'data/train.tsv'
    submission_path = 'data/test.tsv'
    n = 10000

    # Pre-computed POS tags and raw page contents, pickled by an earlier preprocessing run.
    tags = pickle.load(open('cache/tagged.%s.pkl' % (n)))
    custom_contents = np.array(pickle.load(open('cache/custom_contents.%s.pkl' % (n))))
    submission_custom_contents = np.array(pickle.load(open('cache/s.custom_contents.pkl')))
    submission_tags = pickle.load(open('cache/s.tagged.pkl'))

    print 'Reading %s data' % (n)
    data = data_io.read_data(train_path, n)
    submission_data = data_io.read_data(submission_path, 10000) # use all

    contents = get_doc_contents(data[:, 2])
    # Keep contents as an ndarray so get_bestwords can index it with a boolean label mask.
    contents = np.array([ features.normalize(content) for content in contents ])
    Y = data[:, -1].astype(int)
    bestwords = get_bestwords(contents, Y, 100000, n)

    submission_contents = get_doc_contents(submission_data[:, 2])
    submission_contents = [ features.normalize(submission_content) for submission_content in submission_contents ]
    X_submission_ids = submission_data[:, 1]

    # TF-IDF over the chi-squared-selected vocabulary, fit on the training set only.
    v = TfidfVectorizer(min_df = 2, binary = True, norm = 'l2', smooth_idf = True,
                        sublinear_tf = True, strip_accents = 'unicode',
                        vocabulary = bestwords, ngram_range = (1, 2))
    X = v.fit_transform(contents)
    X_submission = v.transform(submission_contents)
    del data
    del submission_data

    # Stack the engineered meta features next to the TF-IDF columns.
    meta_vector = get_meta_vector(tags, contents)
    del tags
    del contents
    meta_vector_2 = get_meta_vector_2(custom_contents)
    del custom_contents
    X = sparse.hstack((X, meta_vector, meta_vector_2)).tocsr() # CSR so the CV folds can slice rows
    print 'train vector ready'
    print 'Final X shape : %s' % (str(X.shape))

    submission_meta_vector = get_meta_vector(submission_tags, submission_contents)
    del submission_tags
    del submission_contents
    submission_meta_vector_2 = get_meta_vector_2(submission_custom_contents)
    del submission_custom_contents
    X_submission = sparse.hstack((X_submission, submission_meta_vector, submission_meta_vector_2)).tocsr()
    print 'submission vector ready'

    print datetime.datetime.now()

    # LSA (TruncatedSVD) followed by logistic regression.
    c = Pipeline([
        ('fe', TruncatedSVD(n_components = 400)),
        ('clf', LogisticRegression(tol = 0.0001, dual = True, penalty = 'l2')),
    ])

    cv_scores = cross_val_score(c, X, Y, cv = 5, scoring = 'roc_auc')
    print '%s | %s' % (np.mean(cv_scores), cv_scores)

    c.fit(X, Y)
    Y_submission_pred = c.predict_proba(X_submission)[:, 1]

    # Write the submission file.
    f = open('submission/simple_20.csv', 'w')
    f.write("urlid,label\n")
    for i in xrange(len(X_submission_ids)):
        url_id = X_submission_ids[i]
        label = Y_submission_pred[i]
        f.write("%s,%s\n" % (url_id, label))
    f.close()

    print datetime.datetime.now()