-
Notifications
You must be signed in to change notification settings - Fork 1
/
hot_word.py
566 lines (494 loc) · 17.7 KB
/
hot_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import time
import datetime
import random
import pickle
from bitarray import bitarray
import re
import os
import threading
import jieba
import redis
import math
import ujson
import cPickle
import heapq
import copy
import leveldb
import fast_search
import c_tf_idf_tk
import tf_idf_config
class cppjieba:
def __init__(self, dict_path = "jieba.dict.utf8", hmm_path = "hmm_model.utf8"):
self.dict_path = dict_path
self.hmm_path = hmm_path
self.inited = 0
self.seg_hd = 0
def initialize(self):
if not self.inited:
self.seg_hd = c_tf_idf_tk.cppjieba_init(self.dict_path, self.hmm_path)
self.inited = 1
def cut(self, s):
if not self.inited:
self.initialize()
return c_tf_idf_tk.cppjieba_cut(self.seg_hd, s)
def __del__(self):
c_tf_idf_tk.cppjieba_close(self.seg_hd)
'''
独立出idf类, 如果某个词不在idf词典和停用词词典中 应给予高idf
每篇文章分出词的前n%作为文章特征词, 依次记录到词dict中
每个分类一个对象, 指定不同的leveldb
首先实现热词记录类
'........'字符串分词结果会以其本身作为一个词
以停用词作为词典, 对idf类中没有的词做fast_search
如果找到 则作为垃圾词,否则调高idf值
如果找到如: t.cn/url.cn/dwz.cn类的短网址domain
首先使用正则替换掉"http://t.cn/XXXXXX"
防止XXXXX切出词, 被当做高idf值的词, 占用词典空间
因为微博数据很多,不加此过滤会导致词典大小暴增
\b(http:\/\/)?[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+
'''
class train_idf:
def __init__(self, dics = {}):
self.word_dic = dics
self.fcounter = 0
self.default_idf = 10
self.log_base = math.e
self.rubbish_set, self.rubbish_hd = self.get_rubbish_set()
cur_path = os.path.dirname(__file__)
dict_path = tf_idf_config.dict_path
hmm_path = tf_idf_config.hmm_path
self.seg_hd = cppjieba(dict_path, hmm_path)
def get_rubbish_set(self, stopword_path = "stopwords.txt"):
rubbish_set = set()
hd = 0
try:
hd = fast_search.load(stopword_path)
with open(stopword_path, "r") as fd:
for l in fd:
rubbish_set.add(l.strip())
except:
hd = 0
#如果读不到文件则忽略
pass
rubbish_set.add(" ")
rubbish_set.add(" ")
rubbish_set.add("\t")
rubbish_set.add("\r")
rubbish_set.add("\n")
rubbish_set.add("\r\n")
rubbish_set.add("DC")
rubbish_set.add("DS")
rubbish_set.add("gt")
return rubbish_set, hd
def set_rubbish_set(self, rubbish_set):
self.rubbish_set = rubbish_set
def is_rubbish(self, word):
if word in self.rubbish_set:
return 1
if self.word_dic.has_key(word):
return 0
result = fast_search.findall(self.rubbish_hd, word)
if result:
return 1
else:
return 0
def set_log_base(self, log_base):
"""
计算idf值log的底默认为math.e
此函数用于更改默认的底
"""
self.log_base = log_base
def set_default_idf(self, idf):
"""
对不存在的词idf值默认返回10
此函数用于更改默认的idf返回值
"""
self.default_idf = idf
def add_doc(self, s):
word_set = set(self.seg_hd.cut(s))
for word in word_set:
if not self.word_dic.has_key(word):
self.word_dic[word] = 0
self.word_dic[word] += 1
self.fcounter += 1
def get_idf(self, word):
"""
如果此word不存在 则返回default
使用set_default_idf(idf) 设置默认idf值
"""
if word in self.rubbish_set:
return math.log(float(self.fcounter)/self.word_dic[word], self.log_base)
elif word not in self.word_dic:
return self.default_idf
else:
return math.log(float(self.fcounter)/self.word_dic[word], self.log_base)
def loads(self, s):
dics = pickle.loads(s)
self.set_vars_by_dics(dics)
def dumps(self):
dics = self.gen_dics()
s = cPickle.dumps(dics)
return s
def save_idf_by_fpath(self, fpath):
with open(fpath, "w") as fd:
self.save_idf_by_fd(fd)
def save_idf_by_fd(self, fd):
idf_list = []
for w in self.word_dic:
lidf = self.get_idf(w)
idf_list.append((lidf, w))
idf_list = heapq.nlargest(len(idf_list), idf_list)
for i in idf_list:
fd.write(i[1] + "\t" + str(i[0]) + "\n")
def set_vars_by_dics(self, dics):
self.word_dic = dics["word_dic"]
self.fcounter = dics["fcounter"]
self.default_idf = dics["default_idf"]
self.log_base = dics["log_base"]
self.rubbish_set = dics["rubbish_set"]
def gen_dics(self):
dics = {}
dics["word_dic"] = self.word_dic
dics["fcounter"] = self.fcounter
dics["default_idf"] = self.default_idf
dics["log_base"] = self.log_base
dics["rubbish_set"] = self.rubbish_set
return dics
class idf:
def __init__(self):
self.idf_dic = {}
self.stopword_set = set()
self.default_idf = tf_idf_config.default_idf
def load_idf(self, idf_path):
with open(idf_path) as fd:
for l in fd:
idx = l.find("\t")
if idx <= 0:
continue
word = l[:idx]
if not word:
continue
if l[idx + 1:]:
idf_v = float(l[idx + 1:])
else:
idf_v = 0
self.idf_dic[word] = idf_v
def load_stopwords(self, stopword_path):
with open(stopword_path) as fd:
for l in fd:
self.stopword_set.add(l.strip())
self.stopword_set.add(" ")
self.stopword_set.add("\t")
self.stopword_set.add("\n")
self.stopword_set.add("\r")
self.stopword_set.add("\r\n")
def load(self, idf_path, stopword_path):
self.load_idf(idf_path)
self.load_stopwords(stopword_path)
def get_idf(self, w):
return self.idf_dic.get(w, self.default_idf)
def is_rubbish(self, w):
return w in self.stopword_set
class hot_word:
idf_hd = None
seg_hd = None
short_url_hd = None
url_re = None
def __init__(self):
if not hot_word.idf_hd:
hot_word.idf_hd = idf()
hot_word.idf_hd.load(tf_idf_config.idf_dumps_path, tf_idf_config.stopwords_path)
if not hot_word.seg_hd:
hot_word.seg_hd = cppjieba(tf_idf_config.dict_path, tf_idf_config.hmm_path)
if not hot_word.short_url_hd:
hot_word.short_url_hd = fast_search.load(tf_idf_config.short_url_path)
if not hot_word.url_re:
hot_word.url_re = re.compile(r'(http:\/\/)*[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+')
self.hot_word_dic = {}
self.get_file_word_flag = "num"
self.word_list_n = 5
self.get_file_word_cbk = {}
self.get_file_word_cbk["num"] = self.get_file_word_list_by_num
self.get_file_word_cbk["percent"] = self.get_file_word_list_by_persent
def clear(self):
self.hot_word_dic.clear()
def set_options(self, option_dic):
'''
get_word_flag:('num', 'percent') 表示使用哪种方式取文章特征词
'num'取tf-idf排名前 n的词,
'percent' 取tf-idf排名前 n%的词
'word_top_num':[1, int_max) 如果使用方式 'num' n的值
'word_top_persent':[1, 100] 如果使用方式 'percent' n的值
'''
self.get_file_word_flag = option_dic.get("get_word_flag", "num")
self.word_list_num = option_dic.get("word_top_num", 5)
self.word_list_persent = option_dic.get("word_top_persent", 10)
if self.get_file_word_flag == "percent":
self.word_list_n = self.word_list_persent
elif self.get_file_word_flag == "num":
self.word_list_n = self.word_list_num
else:
self.word_list_n = 0
def dumps(self):
dics = {}
dics["hot_word_dic"] = pickle.dumps(self.hot_word_dic)
self.write_and_clean_batch()
s = pickle.dumps(dics)
return s
def loads(self, s):
dics = pickle.loads(s)
hot_word.idf_hd = idf()
hot_word.idf_hd.load(tf_idf_config.idf_path, tf_idf_config.stopwords_path)
self.hot_word_dic = pickle.loads(dics["hot_word_dic"])
def __isub__(self, hw_hd):
d = hw_hd.hot_word_dic
for word in d:
self.hot_word_dic[word] = self.hot_word_dic.get(word, 0)
self.hot_word_dic[word] -= d[word]
if not self.hot_word_dic[word]:
del self.hot_word_dic[word]
return self
def __iadd__(self, hw_hd):
d = hw_hd.hot_word_dic
for word in d:
self.hot_word_dic[word] = self.hot_word_dic.get(word, 0)
self.hot_word_dic[word] += d[word]
def get_file_word_list_by_num(self, s, n):
'''
获取文章tf-idf值top_n的词列表
'''
word_list = hot_word.seg_hd.cut(s)
return self.get_file_word_list_base(word_list, n)
def get_file_word_list_by_persent(self, s, n):
'''
获取文章tf-idf值top n%的词列表
'''
word_list = hot_word.seg_hd.cut(s)
#使用去重后的词数
word_num = int(len(set(word_list)) * n * 0.01)
if not word_num:
word_num = 1
#print "%d:%d " % (word_num, len(word_list))
return self.get_file_word_list_base(word_list, word_num)
#@profile
def get_file_word_list_base(self, word_list, n):
'''
传入分好的文章词列表
'''
#l = word_list[:n]
#ret = []
#for i in l:
# ret.append((1, i))
#return ret
l_word_dic = {}
for word in word_list:
if hot_word.idf_hd.is_rubbish(word):
continue
if not l_word_dic.has_key(word):
l_word_dic[word] = 0
l_word_dic[word] += 1
ret_list = []
for word in l_word_dic:
tf_idf = l_word_dic[word] * hot_word.idf_hd.get_idf(word)
ret_list.append((tf_idf, word))
l = heapq.nlargest(n, ret_list)
return l
def s_filter(self, s):
result = fast_search.findall(hot_word.short_url_hd, s)
if result:
s = hot_word.url_re.sub("", s)
return s
#此doc不再加到idf中, idf不再变化
def add_doc(self, fname):
try:
with open(fname, "r") as fd:
s = fd.read()
except:
return -1
self.add_doc_s(s)
#@profile
def add_doc_word_list(self, word_list):
ret_list = self.get_file_word_list_base(word_list, self.word_list_n)
self.add_word_to_dic(ret_list)
def add_doc_word_list_many(self, word_list, hwhd_list):
'''
hwhd_list需要包含自身
'''
ret_list = self.get_file_word_list_base(word_list, self.word_list_n)
for hwhd in hw_list:
hwhd.add_word_to_dic(ret_list)
def get_word_list(self, s):
s = self.s_filter(s)
word_list = self.seg_hd.cut(s)
return word_list
def add_doc_s(self, s):
s = self.s_filter(s)
word_list = self.get_file_word_cbk[self.get_file_word_flag](s, self.word_list_n)
self.add_word_to_dic(word_list)
#@profile
def add_word_to_dic(self, word_list):
for w in word_list:
try:
word = w[1]
except:
exit()
if not self.hot_word_dic.has_key(word):
self.hot_word_dic[word] = 0
self.hot_word_dic[word] += 1
#不带频数的word_list
def get_top_n_word_list(self, n):
l = self.get_top_n_word_list_with_freq(n)
return [i[1] for i in l]
#带频数的word_list
def get_top_n_word_list_with_freq(self, n):
hot_word_list = []
for word in self.hot_word_dic:
hot_word_list.append((self.hot_word_dic[word], word))
l = heapq.nlargest(n, hot_word_list)
return l
#无需外界管理数组头
#只需append,loop_list类会管理长度及数组头
#__init__(self, 长度, 类型)
class loop_list:
def __init__(self, list_len, T):
self.T = T
self.is_full = 0
self.list_len = list_len
self.cur_idx = 0
self.head_idx = 0
self.l = []
for i in range(list_len):
self.l.append(T())
def __getitem__(self, idx):
if idx >= self.list_len or idx < 0:
raise IndexError
else:
return self.l[(self.head_idx + idx) % self.list_len]
def __setitem__(self, idx, v):
if idx >= self.list_len or idx < 0:
raise IndexError
else:
self.l[(self.head_idx + idx) % self.list_len] = v
def __len__(self):
return self.list_len
def move_next(self):
'''
cur_idx指向下一个
'''
tmp_idx = (self.cur_idx + 1) % self.list_len
if tmp_idx < self.cur_idx:
self.is_full = 1
self.cur_idx = tmp_idx
if self.is_full:
self.head_idx = (self.head_idx + 1) % self.list_len
if hasattr(self.l[self.cur_idx], 'clear'):
self.l[self.cur_idx].clear()
else:
self.l[self.cur_idx] = self.T()
def to_list(self):
ret_list = []
i = 0
while ((self.head_idx + i) % self.list_len) != self.cur_idx:
ret_list.append(self.__getitem__(i))
i += 1
def end(self):
'''
返回最后一个元素的索引, 外部使用
'''
if self.is_full:
return self.list_len - 1
else:
return self.cur_idx
def __del__(self):
pass
#内部使用循环数组做hot_word对象管理
#todo: 如果性能过低,尝试使用同时加到min,max里,如果min过期,则从max里减去min
class hot_event:
rhd = None
def __init__(self, pre_fix):
self.pre_fix = pre_fix
self.init_time = time.time()
self.cur_time = self.init_time
self.min_interval = tf_idf_config.min_interval
self.max_interval = tf_idf_config.max_interval
#此处实例化hot_word类后类属性会初始化,loop_list创建对象则不会再费时间
self.max_hd = hot_word()
self.hd_list = loop_list(int(self.max_interval / self.min_interval), hot_word)
if not hot_event.rhd:
hot_event.rhd = redis.Redis(host = tf_idf_config.redis_host, port = tf_idf_config.redis_port, db = tf_idf_config.redis_db)
self.thread_lock = threading.Lock()
t = threading.Thread(target=self.refresh, args=())
t.setDaemon(True)
t.start()
def refresh(self):
while 1:
t = time.time()
self.thread_lock.acquire()
self.store_word_lists()
if self.hd_list.is_full:
self.max_hd -= self.hd_list[0]
self.hd_list.move_next()
self.cur_time = t
self.thread_lock.release()
time.sleep(tf_idf_config.min_interval)
def add_doc_s(self, s):
word_list = self.max_hd.get_word_list(s)
self.thread_lock.acquire()
self.hd_list[self.hd_list.end()].add_doc_word_list(word_list)
self.max_hd.add_doc_word_list(word_list)
self.thread_lock.release()
def add_doc_s_many(self, s, hehd_list):
'''
hehd_list需要包含自身
'''
word_list = self.max_hd.get_word_list(s)
#全部加锁
for hehd in hehd_list:
hehd.thread_lock.acquire()
hwhd_list = []
for hehd in hehd_list:
hwhd_list.append(hehd.hd_list[hehd.hd_list.end()])
hwhd_list.append(hehd.max_hd)
self.max_hd.add_doc_word_list_many(word_list, hwhd_list)
for hehd in hehd_list:
hehd.thread_lock.release()
def get_top_n_word_list_with_freq(self, n):
'''
返回tuple
元素1为当前min_interval中热门词
元素2为当前max_interval中热门词
'''
min_word_list = self.hd_list[self.hd_list.end()].get_top_n_word_list_with_freq(n)
max_word_list = self.max_hd.get_top_n_word_list_with_freq(n)
return (min_word_list, max_word_list)
def get_top_n_word_list(self, n):
'''
返回tuple
元素1为当前min_interval中热门词
元素2为当前max_interval中热门词
'''
freq_word_lists = self.get_top_n_word_list_with_freq(n)
return ([i[1] for i in freq_word_lists[0]], [i[1] for i in freq_word_lists[1]])
def store_word_lists(self):
'''
生成min,max word_list, 并存储
'''
print id(hot_event.rhd)
word_lists = self.get_top_n_word_list(tf_idf_config.word_list_len)
dt = datetime.datetime.now()
min_key = "%s:%s" % (self.pre_fix, tf_idf_config.min_interval_key)
min_json = {}
min_json["words"] = word_lists[0]
min_json['ctime'] = dt
min_json_str = ujson.dumps(min_json)
hot_event.rhd.rpush(min_key, min_json_str)
max_key = "%s:%s" % (self.pre_fix, tf_idf_config.max_interval_key)
max_json = {}
max_json["words"] = word_lists[1]
max_json['ctime'] = dt
max_json_str = ujson.dumps(max_json)
hot_event.rhd.rpush(max_key, max_json_str)
def __del__(self):
pass