comment_crawler.py
# coding: utf-8
"""
Based on the topic list already crawled for each group, crawl the comments of each individual topic.
The overall structure is similar to topic_crawler.py.
"""
from urlparse import urljoin,urlparse
from collections import deque
import traceback
import logging
import time
from datetime import datetime
import pdb
import codecs # for file encodings
import os
import sys
from bs4 import BeautifulSoup
from webPage import WebPage
from threadPool import ThreadPool
from patterns import *
from models import Topic
from database import Database
from logconfig import congifLogger
import stacktracer
log = logging.getLogger('Main.CommentCrawler')
class CommentCrawler(object):
def __init__(self, group_id, topic_id_list, thread_num, base_path, topic_info_path, comment_info_path):
"""
`group_id` 当前的Group id
`topic_id_list` 需要抓取的topic id的list
`thread_num` 开启的线程数目
`topic_info_path` 存储topic信息的文件
`comment_info_path` 存储comment信息的文件
"""
#线程池,指定线程数
self.thread_pool = ThreadPool(thread_num)
# 由于现在是将不同的topic信息保存到不同的文件中,所以可以同时存储
self.save_thread = ThreadPool(10)
self.topic_info_path = topic_info_path
self.comment_info_path = comment_info_path
self.base_path = base_path
# 已经访问的页面: Group id ==> True or False
self.visited_href = set()
# 抓取失败的topic id
self.failed = set()
# 依次为每个小组抽取topic评论
self.group_id = group_id
self.topic_id_list = topic_id_list # 等待抓取的topic列表
# 存储结果
# topic ID ==> Topic对象
self.topic_dict = dict()
# 存放下一个处理的评论页数: topic ID ==> 1,2,3...
self.next_page = dict()
# 已经抓取完毕的topic id集合
self.finished = set()
self.is_crawling = False
# 每个topic抓取的最多comments个数
#self.MAX_COMMETS_NUM = 5000
self.MAX_COMMETS_NUM = float('inf')
# 每页的评论数量
self.COMMENTS_PER_PAGE = 100
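# For orientation (comment added here, not part of the original code): with COMMENTS_PER_PAGE = 100,
# a value n in self.next_page corresponds to the comment page fetched via ?start=<n*100>;
# see _getFutureVisit below.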
def start(self):
print '\nStart Crawling comment list for group: ' + self.group_id + '...\n'
self.is_crawling = True
self.thread_pool.startThreads()
self.save_thread.startThreads()
# open the output files
self.topic_info_file = codecs.open(self.topic_info_path, 'w', 'utf-8')
self.comment_info_file = codecs.open(self.comment_info_path, 'w', 'utf-8')
self.topic_id_list = list(set(self.topic_id_list)) # remove duplicate topic ids
print "Total topics in group %s: %d." % (self.group_id, len(self.topic_id_list))
# seed the task queue with the first page of every topic
for topic_id in self.topic_id_list:
url = "http://www.douban.com/group/topic/" + topic_id + "/"
self.thread_pool.putTask(self._taskHandler, url)
# later comment pages look like: http://www.douban.com/group/topic/35082953/?start=100
self.next_page[topic_id] = 1
# the earlier notion of crawl depth is dropped entirely; tasks are pushed to the thread pool as they become available
while True:
# keep the number of pending tasks in the thread pool at twice the number of threads
print "Check threadPool queue..."
while self.thread_pool.getTaskLeft() < self.thread_pool.threadNum * 2:
# get the next URL that needs to be visited
url = self._getFutureVisit()
if url is not None:
self.thread_pool.putTask(self._taskHandler, url)
else: # no more URLs can be scheduled for now
break
# check the thread pool queue every 2 seconds
time.sleep(2)
# check whether all topics have been processed
if len(self.finished) == len(self.topic_id_list):
break
elif len(self.finished) > len(self.topic_id_list):
assert(False)
print 'Total topics: %d, Finished topic: %d' % (len(self.topic_id_list), len(self.finished))
remain = set(self.topic_id_list) - self.finished
if len(remain) < 5:
print 'Unfinished: ', remain
# wait until all tasks in the thread pool have finished
print "Totally visited: ", len(self.visited_href)
#pdb.set_trace()
while self.thread_pool.getTaskLeft() > 0:
print "Task left in threadPool: ", self.thread_pool.getTaskLeft()
print "Task queue size: ", self.thread_pool.taskQueue.qsize()
print "Running tasks: ", self.thread_pool.running
time.sleep(2)
# check progress of the save thread pool
while self.save_thread.getTaskLeft() > 0:
print "Task left in save thread: ", self.save_thread.getTaskLeft()
print "Task queue size: ", self.save_thread.taskQueue.qsize()
print "Running tasks: ", self.save_thread.running
time.sleep(2)
# record the topic ids that failed to crawl
log.info('Topic ids that failed to crawl:')
s = ''
for topic_id in self.failed:
s += (topic_id + '\n')
log.info('\n' + s)
print "Terminating all threads..."
self.stop()
assert(self.thread_pool.getTaskLeft() == 0)
self.topic_info_file.close()
self.comment_info_file.close()
print "Main Crawling procedure finished!"
print "Start to save result..."
#self._saveCommentList()
#self._saveComment2file()
log.info("Processing done with group: %s" % (self.group_id))
def stop(self):
self.is_crawling = False
self.thread_pool.stopThreads()
self.save_thread.stopThreads()
def _taskHandler(self, url):
""" 根据指定的url,抓取网页,并进行相应的访问控制
"""
print "Visiting : " + url
webPage = WebPage(url)
# fetch the page content
flag = webPage.fetch()
match_obj = RETopic.match(url)
match_obj2 = REComment.match(url)
if flag:
if match_obj is not None:
topic_id = match_obj.group(1)
topic = Topic(topic_id, self.group_id)
comment_list = topic.parse(webPage, isFirstPage = True) # First page parsing
self.topic_dict[topic_id] = topic
# saving to a single file (deprecated)
#self.save_thread.putTask(self._saveHandler, comment_list, topic = topic)
elif match_obj2 is not None:
topic_id = match_obj2.group(1)
start = int(match_obj2.group(2))
# extract comment data from pages other than the first page
if topic_id in self.topic_dict:
topic = self.topic_dict[topic_id]
if topic is None:
log.error('Unknown error: stop extracting topic id %s and release its memory.' % topic_id)
self.topic_dict[topic_id] = None
return False
else:
# the first comment page must be processed first, otherwise topic_id never appears as a key in self.topic_dict
log.error('Error: the first page of comments must be extracted first, topic id: %s' % topic_id)
self.failed.add(topic_id)
self.finished.add(topic_id)
return False
comment_list = topic.parse(webPage, isFirstPage = False) # non-firstpage parsing
# saving to a single file (deprecated)
#self.save_thread.putTask(self._saveHandler, comment_list, topic = None)
else:
#pdb.set_trace()
log.info('Malformed topic URL: %s in Group: %s.' % (url, self.group_id))
# topic/topic_id are undefined in this branch, so stop here instead of falling through
self.visited_href.add(url)
return False
# check whether crawling of this topic is complete; if so, release its dict entry
# this matters because memory usage grows with the number of topics
if topic.isComplete():
self.save_thread.putTask(self._saveTopicHandler, self.topic_dict, topic_id)
#self.topic_dict[topic_id] = None # free the resources
self.finished.add(topic_id)
log.info('Topic: %s finished crawling.' % topic_id)
self.visited_href.add(url)
return True
else:
# handle pages that failed to fetch
# if any single page fails, the topic is added to finished
if match_obj is not None:
# the first page of the topic could not be fetched, so put it on the finished list
topic_id = match_obj.group(1)
elif match_obj2 is not None:
topic_id = match_obj2.group(1)
start = int(match_obj2.group(2))
else:
log.info('Malformed topic URL: %s in Group: %s.' % (url, self.group_id))
# no topic_id is available in this branch, so stop here instead of falling through
self.visited_href.add(url)
return False
# record the failed topic id and mark the topic as finished
self.failed.add(topic_id)
self.finished.add(topic_id) # some information about this topic may already have been recorded
self.visited_href.add(url)
return False
def _saveTopicHandler(self, topic_dict, topic_id):
""" 存储抓取完毕的帖子信息以及其对应的Comment。
不过,跟_saveHandler函数不同的是,这里是按照topic id存储
@topic_dict 存储topic信息的字典
@topic_id 需要存储的topic id
"""
# 对评论进行排序,并查找quote comment
topic = topic_dict[topic_id]
topic.sortComment()
topic_path = self.base_path + group_id + '/' + topic_id + '-info.txt'
# 存储topic本身的信息
f = codecs.open(topic_path, 'w', 'utf-8')
s = topic.getSimpleString('[=]')
f.write(s + '\n')
#f.write('[*ROWEND*]')
# 存储comment信息,存储到相同的文件中
for comment in topic.comment_list:
s = comment.getSimpleString('[=]')
#f.write(s + '\n[*ROWEND*]\n')
f.write(s + '\n')
f.close()
self.topic_dict[topic_id] = None # 释放资源
log.info("Topic: %s 存储结束" % topic_id)
def _getFutureVisit(self):
"""根据当前的访问情况,获取下一个要访问的网页
"""
for topic_id in self.topic_dict:
if topic_id in self.finished:
continue
topic = self.topic_dict[topic_id]
if topic is None:
continue
if topic.max_comment_page <= 0:
# 还未处理该topic的首页
continue
elif topic.max_comment_page == 1:
# 该topic只有首页有评论
continue
else:
# 该topic有多页评论
next_start = self.next_page[topic_id]
url = "http://www.douban.com/group/topic/" + topic_id + "/?start=" + str(next_start * self.COMMENTS_PER_PAGE)
if next_start <= topic.max_comment_page-1:
self.next_page[topic_id] = next_start + 1
return url
else:
continue
return None
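# Example of a URL produced above (using the sample topic id from start()): for topic 35082953
# with self.next_page[topic_id] == 1, the returned URL is
# http://www.douban.com/group/topic/35082953/?start=100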
def _getAllHrefsFromPage(self, url, pageSource):
'''Parse the html source and return a list of all links on the page.'''
hrefs = []
soup = BeautifulSoup(pageSource)
results = soup.find_all('a', href=True)
for a in results:
# the link must be encoded as utf-8: Chinese file links such as http://aa.com/文件.pdf
# are not url-encoded automatically by bs4, which would otherwise cause an encoding exception
href = a.get('href').encode('utf8')
if not href.startswith('http'):
href = urljoin(url, href) # resolve relative links
hrefs.append(href)
return hrefs
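# Example of the relative-link handling (standard urlparse.urljoin behaviour; the path is hypothetical):
#   urljoin('http://www.douban.com/group/topic/35082953/', '/people/someone/')
#   -> 'http://www.douban.com/people/someone/'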
def _isHttpOrHttpsProtocol(self, href):
protocol = urlparse(href).scheme
if protocol == 'http' or protocol == 'https':
return True
return False
def _getAlreadyVisitedNum(self):
# visited_href holds links that have already been handed to the taskQueue; some may still be in progress.
# The real number of visited links is therefore len(visited_href) minus the number of pending tasks.
if len(self.visited_href) == 0:
return 0
else:
return len(self.visited_href) - self.thread_pool.getTaskLeft()
'''
def _saveHandler(self, comment_list, topic):
""" 将topic信息和comemnt信息保存到文件中
注意:这里将topic信息和comment信息分开存储
注意:这里每一行存储一个topic或者comment,不再使用ROWEND符号作为结束
Note: 使用此方式保存comment quote会产生错误。
"""
# 先保存comment_list id
# 判断是否是第一次保存该topic
if topic != None: # 如果是第一次保存,则需要保存topic的基本信息
s = topic.getSimpleString('[=]')
#self.topic_info_file.write(s + '\n[*ROWEND*]\n')
self.topic_info_file.write(s + '\n')
# save the comment information
for comment in comment_list:
s = comment.getSimpleString('[=]')
#self.comment_info_file.write(s + '\n[*ROWEND*]\n')
self.comment_info_file.write(s + '\n')
# make sure the data has been written to disk, so the crawler can be stopped at any time
self.topic_info_file.flush()
os.fsync(self.topic_info_file.fileno()) # os.fsync() forces the file referred to by the file descriptor to be written to disk
self.comment_info_file.flush()
os.fsync(self.comment_info_file.fileno())
def _saveCommentList(self):
"""将抽取的结果存储在文件中,包括存储topic内容和评论内容
Note: 这次是将存储过程放在主线程,将会阻塞抓取过程
NOTE: 此函数已经不再使用
"""
# 如果不存在目录,则创建它
path = "data/" + self.group_id + "/"
if not os.path.exists(path):
os.mkdir(path)
for topic_id in self.topic_id_list:
topic = self.topic_dict[topic_id]
path = "data/" + self.group_id + "/" + topic_id + ".txt"
f = codecs.open(path, "w", "utf-8", errors='replace')
f.write(topic.__repr__())
f.close()
# save the failed hrefs
f = open("data/"+self.group_id+"/failed.txt", "w")
for href in self.failed:
f.write(href + "\n")
f.close()
# write comment structures
path = "structure/" + self.group_id + "/"
if not os.path.exists(path):
os.mkdir(path)
for topic_id in self.topic_dict:
atopic = self.topic_dict[topic_id]
path = "structure/" + self.group_id + "/" + topic_id + ".txt"
f = codecs.open(path, "w", "utf-8", errors='replace')
# each line: comment id, comment user id, (quoted comment id, user id of the quoted comment)
for comment in atopic.comment_list:
f.write(comment.cid + " " + comment.user_id)
if comment.quote is not None:
f.write(" " + comment.quote.cid + " " + comment.quote.user_id)
f.write("\n")
f.close()
def _saveComment2file(self):
""" 直接将抓取结果存入文件中
"""
ftopic = open(self.topic_info_path, 'w')
fcomment = open(self.comment_info_path , 'w')
for topic_id in self.topic_dict:
topic = self.topic_dict[topic_id]
s = topic.getSimpleString(delimiter = '[=]')
ftopic.write(s + '\n[*ROWEND*]\n')
for comment in topic.comment_list:
cs = comment.getSimpleString(delimiter = '[=]')
fcomment.write(cs + '\n[*ROWEND*]\n')
ftopic.close()
fcomment.close()
'''
if __name__ == "__main__":
LINE_FEED = "\n" # newline separator used between records
stacktracer.trace_start("trace.html",interval=5,auto=True) # Set auto flag to always update file!
congifLogger("log/comment_crawler.log", 5)
#group_id_list = ['FLL', '294806', 'MML']
#group_id_list = ['test']
#group_id_list = ['70612', 'FLL']
group_id_list = []
if len(sys.argv) <= 1:
print "Group IDs were not provided."
sys.exit()
# add group ids
for i in range(1, len(sys.argv)):
group_id_list.append(sys.argv[i])
print "Crawling comments for groups: ", group_id_list
MAX_TOPIC_NUM = float('inf') # maximum number of topics to process per group
group_id = group_id_list[0]
"""
for index in range(8,9):
# read the topic list
f = open('tables/ustv/TopicList-ustv-remain-' + str(index), 'r')
#f = open('tables/ustv/topic-list-test.txt', 'r')
topic_list = []
for line in f:
line = line.strip()
if line != "":
topic_list.append(line)
#if len(topic_list) >= MAX_TOPIC_NUM:
# break
f.close()
time_now = datetime.now()
topic_path = 'tables/' + group_id + '/TopicInfo-' + group_id + '-' + str(time_now) + '-raw-' + str(index)
comment_path = 'tables/' + group_id + '/CommentInfo-' + group_id + '-' + str(time_now) + '-raw-' + str(index)
comment_crawler = CommentCrawler(group_id, topic_list, 5, topic_path, comment_path)
comment_crawler.start()
"""
base_path = '/home/kqc/dataset/douban-group/'
# read the topic list for the group to crawl (originally the insidestory group)
f = open(base_path + 'TopicList-' + group_id + '.txt', 'r')
topic_list = []
for line in f:
line = line.strip()
if line != "":
topic_list.append(line)
f.close()
#time_now = datetime.now()
topic_path = '/home/kqc/dataset/douban-group/' + group_id + '/TopicInfo-' + group_id + '-raw-all.txt'
comment_path = '/home/kqc/dataset/douban-group/' + group_id + '/CommentInfo-' + group_id + '-raw-all.txt'
comment_crawler = CommentCrawler(group_id, topic_list, 5, base_path, topic_path, comment_path)
comment_crawler.start()
print "Done"
stacktracer.trace_stop()