forked from yaojialyu/crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare.py
127 lines (109 loc) · 4.05 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#encoding=utf-8
"""
Helper functions and constants used when preparing the data sets
(topics, comments and user lists) for the crawler project.
"""
from datetime import datetime
import codecs
import logging
from utils import is_between
from logconfig import congifLogger
# config logging
log = logging.getLogger('Main.prepare')
congifLogger("prepare.log", 5)
# Start/end dates of the training set
TRAIN_START_DATE = datetime(2012, 8, 1)
TRAIN_END_DATE = datetime(2012, 11, 1)
TEST_START_DATE = datetime(2012, 11, 1)
TEST_END_DATE = datetime(2013, 1, 1)
# Sentinel earliest/latest dates, used as "no bound" defaults below
VERY_EARLY_TIME = datetime(1900, 1, 1)
VERY_LATE_TIME = datetime(2050, 1, 1)
def load_topic_user(filepath, start_date = VERY_EARLY_TIME, end_date = VERY_LATE_TIME):
    """ Load all topics whose pubdate falls in the given range, plus the
    ids of the users who posted them.

    Each line of the file holds '[=]'-separated fields:
    topic_id, group_id, user_id, pubdate, title, content and, optionally,
    a 7th comment_list field.  Lines with fewer than 6 fields are logged
    and counted as badly formatted.  A topic record may therefore end up
    with an empty comment_list.

    Returns:
        (topic_dict, user_set): topic_dict maps topic_id -> topic record
        (a dict); user_set holds the ids of the topic posters.
    """
    print('Loading topic from %s' % filepath)
    # map topic_id --> dict()
    topic_dict = dict()
    user_set = set()
    bad_count = 0
    f = codecs.open(filepath, 'r', 'utf-8')
    try:
        # try/finally ensures the file is closed even if a parse error
        # (e.g. a malformed date in strptime) escapes the loop
        for line in f:
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) < 6:
                log.info('Bad formatted topic: %s' % line)
                bad_count += 1
                continue
            pubdate = datetime.strptime(seg_list[3], "%Y-%m-%d %H:%M:%S")
            if not is_between(pubdate, start_date, end_date):
                continue
            # record this topic
            topic = {
                'topic_id': seg_list[0],
                'group_id': seg_list[1],
                'user_id': seg_list[2],
                'pubdate': pubdate,
                'title': seg_list[4],
                'content': seg_list[5],
            }
            user_set.add(topic['user_id'])
            if len(seg_list) == 7:  # a comment_list field is present
                # drop the trailing comma, if any
                s = seg_list[6]
                if s != '' and s[-1] == ',':
                    s = s[0:-1]
                topic['comment_list'] = s
            else:
                topic['comment_list'] = ''
            topic_dict[topic['topic_id']] = topic
    finally:
        f.close()
    log.info('Number of bad formatted topic: %d' % bad_count)
    return topic_dict, user_set
def load_comment_user(filepath, topic_dict, start_date = VERY_EARLY_TIME, end_date = VERY_LATE_TIME):
    """ Load all comments (id, topic id, content, ...) whose pubdate falls
    in the given range.

    topic_dict is still required here: only comments whose topic_id is a
    key of topic_dict are collected.  Each line holds exactly 7
    '[=]'-separated fields: comment_id, group_id, topic_id, user_id,
    pubdate, ref_comment_id, content; other lines are logged and counted
    as badly formatted.

    Returns:
        (comment_dict, user_set): comment_dict maps comment_id -> comment
        record (a dict); user_set holds the ids of the commenters.
    """
    print('Loading comment from %s' % filepath)
    comment_dict = dict()
    user_set = set()
    bad_count = 0
    f = codecs.open(filepath, 'r', 'utf-8')
    try:
        # try/finally fixes a leak: the original never closed this file
        for line in f:
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) != 7:
                log.info('Bad formatted comment: %s' % line)
                bad_count += 1
                continue
            pubdate = datetime.strptime(seg_list[4], "%Y-%m-%d %H:%M:%S")
            topic_id = seg_list[2]
            # guard clause: keep only comments on known topics inside the window
            if topic_id not in topic_dict or not is_between(pubdate, start_date, end_date):
                continue
            comment = {
                'comment_id': seg_list[0],
                'group_id': seg_list[1],
                'topic_id': topic_id,
                'user_id': seg_list[3],
                'pubdate': pubdate,  # reuse the date parsed above (was parsed twice)
                'ref_comment_id': seg_list[5],
                'content': seg_list[6],
            }
            user_set.add(comment['user_id'])
            comment_dict[comment['comment_id']] = comment
    finally:
        f.close()
    log.info('Number of bad formatted comment: %d' % bad_count)
    return comment_dict, user_set
def load_user_list(user_path):
    """ Read a list of user uids, one per line, from user_path.

    Surrounding whitespace is stripped from each line; blank lines are
    kept as empty strings (unchanged from the original behaviour).

    Returns:
        list of uid strings in file order.
    """
    # 'with' closes the file even if reading raises (the original leaked
    # the handle on error)
    with open(user_path) as f:
        return [uid.strip() for uid in f]