/
zhihucrawl.py
495 lines (457 loc) · 21.1 KB
/
zhihucrawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
# coding: utf-8
from pyquery import PyQuery as pq
import urllib, urllib2
import os, time, datetime, random, shutil, threading
import re, string, Queue
import gzip, zlib
import cookielib
class Pack:
    """Mutable holder shared by the main thread and the download workers.

    ``finishedThreadNum`` doubles as a shutdown signal: ZhihuGet sets it to
    -1 while workers must keep running, then to 0 to ask them to stop; each
    ImageThread that sees a value >= 0 increments it once and exits.
    """

    # Class-level default; instances overwrite it with their own value.
    finishedThreadNum = 0
class ImageThread(threading.Thread):
    """Worker thread that downloads ``(url, local_path)`` jobs from a queue.

    The worker keeps polling the queue until the shared ``pack`` object
    signals shutdown (``finishedThreadNum >= 0``); it then increments that
    counter once, so the main thread can count finished workers, and exits.
    """

    def __init__(self, lock, threadName, g_queue, p):
        """Store the shared state.

        lock       -- lock guarding both ``g_queue`` and ``p``
        threadName -- name given to the underlying Thread
        g_queue    -- queue of ``(source_url, destination_path)`` tuples
        p          -- object exposing a ``finishedThreadNum`` counter
        """
        # The parent initialiser must be called explicitly.
        super(ImageThread, self).__init__(name=threadName)
        self.lock = lock
        self.queue = g_queue
        self.pack = p

    def run(self):
        """Download queued images until shutdown is signalled."""
        while True:
            job = None
            # Hold the lock only while touching shared state. Downloading
            # under the lock would serialise all workers, block the producer,
            # and leave the lock held forever if the download raised.
            with self.lock:
                if self.queue.qsize() > 0:
                    job = self.queue.get()
            if job is not None:
                try:
                    urllib.urlretrieve(job[0], job[1])
                except IOError as err:
                    # Best effort: one broken image must not kill the worker.
                    print("download of {} failed: {}".format(job[0], err))
            with self.lock:
                if self.pack.finishedThreadNum >= 0:
                    self.pack.finishedThreadNum += 1
                    break
            time.sleep(1)
class ZHYear:
    """A calendar date stored as three plain numbers: year, month, day."""

    def __init__(self, y, m, d):
        self.year, self.month, self.day = y, m, d

    def newerThan(self, a):
        """Compare this date with ``a``.

        Returns the difference of the most significant field that differs
        (year, then month, then day): positive when this date is later,
        negative when earlier, 0 when both dates are equal.
        """
        for field in ("year", "month"):
            gap = getattr(self, field) - getattr(a, field)
            if gap != 0:
                return gap
        return self.day - a.day
class ZhihuGet(object):
# 初始化
def __init__(self):
    """Load the configuration and derive every path and URL used later.

    Reads the dict returned by loadConfig(), converts numeric and boolean
    settings, creates the image queue, lock and shutdown holder, and
    installs a global urllib2 opener that keeps cookies.
    """
    conf = self.loadConfig()

    def number(key):
        return string.atoi(conf[key])

    def flag(key):
        # Only the literal text "true" (any case) switches an option on.
        return conf[key].lower() == 'true'

    self.targetUser = conf['targetUser']
    self.docRootDir = conf['docRootDir']
    self.sleepMin = number('sleepMin')
    self.sleepMax = number('sleepMax')
    self.oldLimit = number('oldLimit')
    self.startPage = number('startPage')
    self.loginUserName = conf['loginUserName']
    self.loginPassword = conf['loginPassword']
    self.loginShowName = conf['loginShowName']
    self.dirSeparator = conf['dirSeparator']
    self.sysEncoding = conf['sysEncoding']
    self.downloadImageThread = number('downloadImageThread')
    self.alwaysGetAll = flag('alwaysGetAll')
    self.debug = flag('debug')
    self.backup = flag('backup')
    self.downloadImage = flag('downloadImage')

    sep = self.dirSeparator
    stamp = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
    self.backupDir = "{}{}bak{}{}-{}".format(self.docRootDir, sep, sep,
                                             self.targetUser, stamp)
    self.saveHtmlDir = self.docRootDir + sep + self.targetUser
    self.statusFileName = self.saveHtmlDir + sep + "status"
    self.answerURL = 'http://www.zhihu.com/people/{}/answers'.format(self.targetUser)
    self.hasMeetOld = False
    self.logFileName = "{}{}trace-{}.log".format(self.docRootDir, sep,
                                                 self.targetUser)

    # Shared state for the image download workers; -1 means "keep running".
    self.queue = Queue.Queue()
    self.lock = threading.Lock()
    self.pack = Pack()
    self.pack.finishedThreadNum = -1

    # Install a cookie-aware opener so later urlopen() calls share a session.
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    urllib2.install_opener(urllib2.build_opener(cookie_handler))
def isDebug(self):
return self.debug
# usage 信息
def usage(self):
    """Print the one-line command-line usage hint."""
    hint = "PWS$./crawZhihu.py configFilePath (default ./zhihu.conf)"
    print(hint)
# 加载参数
def loadConfig(self, confPath="zhihu.conf"):
if not os.path.exists(confPath):
print "configuration file " + confPath + " not found!"
self.usage()
return
fp = open(confPath, "r");
confDict = {}
for eachline in fp:
eachline = eachline.strip()
if eachline == '' or eachline[0] == '#':
continue
strings = eachline.strip().split("=")
confDict.setdefault(strings[0].strip(), strings[1].strip())
return confDict
# 打印log信息
def logging(self, content, force=False):
    """Append a timestamped line to the log file, optionally echoing it.

    content -- text to record
    force   -- when true the text is printed even if debug mode is off
    """
    if self.isDebug() or force:
        print(content)
    logtime = time.strftime('%Y-%m-%d, %H:%M:%S', time.localtime(time.time()))
    # "with" closes the handle even when formatting or writing raises.
    with open(self.utf8ToSys(self.logFileName), "a+") as file_object:
        file_object.write("{} {}\n".format(logtime, content))
# 查看是否已经有存档
def prepareDirs(self):
    """Make sure the answer directory exists.

    Returns True when the directory had to be created (first run, fetch
    everything) and False when it already existed (incremental update).
    Calls exit() if the directory is still missing after os.mkdir().
    """
    target = self.utf8ToSys(self.saveHtmlDir)
    if os.path.exists(target):
        self.logging("update new answers", True)
        return False

    self.logging("dir {} not exists, creat now".format(self.saveHtmlDir), True)
    os.mkdir(self.utf8ToSys(self.saveHtmlDir))
    if not os.path.exists(self.utf8ToSys(self.saveHtmlDir)):
        self.logging("creat dir {} failed, exit".format(self.saveHtmlDir), True)
        exit()
    self.logging("get all answers", True)
    return True
# 记录最后修改的时间
def tagLastModifacationToFile(self):
    """Append an ``updated at: <timestamp>`` line to the status file."""
    timenow = time.strftime('%Y-%m-%d-%H-%M', time.localtime(time.time()))
    # "with" closes the handle even if the write fails.
    with open(self.utf8ToSys(self.statusFileName), 'a') as file_object:
        file_object.write("updated at: " + timenow + "\r\n")
# 检查登陆,并在需要时重新登录
def checkAndLogin(self):
    """Log in when no session is active.

    Returns True when a session already exists or the login attempt
    succeeds, False when the page fetched after logging in still does not
    look like a logged-in page.
    """
    if self.hasLogin():
        return True
    self.logging("not logged in, try to login", True)
    homepage = self.login()
    if self.hasLogin(homepage):
        return True
    self.logging("login failed, exit", True)
    return False
def buildReq(self, url, postdata=None):
req = None
loginHeaders = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17"),
("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"),
("Accept", "*/*"), ("X-Requested-With", "XMLHttpRequest"),
("Accept-Encoding", "gzip,deflate,sdch"),
("Accept-Language", "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4"),
("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3"),
("Referer", "http://www.zhihu.com")]
if postdata is None:
req = urllib2.Request(url)
else:
req = urllib2.Request(url, urllib.urlencode(postdata))
for i in loginHeaders:
req.add_header(i[0], i[1])
return req
# 如果是gzip,则解压缩
def getResponseContent(self, resp):
if "{}".format(resp.info()).find("Content-Encoding: gzip") != -1:
self.logging("page compressed using gzip, decompress it")
return zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS);
else:
return resp.read()
# 登录函数
def login(self):
    """Post the credentials, then fetch and return the home page body.

    The returned page is what hasLogin() inspects to decide whether the
    login worked.
    """
    xsrf = self.get_xsrf()
    postdata = {"email": self.loginUserName,
                "password": self.loginPassword,
                '_xsrf': xsrf}
    # Never write the real password to the console or the log file.
    loggable = dict(postdata, password="******")
    self.logging("logging in now, params: {}".format(loggable), True)
    # Only the cookies set by this response matter; release it right away.
    req = self.buildReq("http://www.zhihu.com/login", postdata)
    urllib2.urlopen(req).close()
    # Fetch the home page with the session cookies now in place.
    req = self.buildReq("http://www.zhihu.com")
    resp = urllib2.urlopen(req)
    return self.getResponseContent(resp)
# 获取登陆用的_xsrf
def get_xsrf(self):
    """Fetch the home page and cut the ``_xsrf`` form token out of it.

    The token is the text between the first ``value="`` that follows the
    first ``_xsrf`` and the next ``"/>``. Raises IndexError when either
    marker is missing from the page.
    """
    page = urllib2.urlopen("http://www.zhihu.com").read()
    after_name = page.split('_xsrf')[1]
    after_value = after_name.split('value="')[1]
    token = after_value.split('"/>')[0]
    return token
# 将content保存到文件,注意小心重名的情况
def saveToFile(self, content, targetFile):
    """Write ``content`` to ``targetFile``, replacing any existing file.

    content    -- text to store
    targetFile -- UTF-8 encoded path; converted to the system encoding
                  before the file is opened
    """
    self.logging("saving file " + targetFile)
    # "with" closes the handle even if the write fails part-way.
    with open(self.utf8ToSys(targetFile), 'w') as file_object:
        file_object.write(content)
def utf8ToSys(self, str):
return str.decode('UTF-8').encode(self.sysEncoding)
def sysToUTF8(self, str):
return str.decode(self.sysEncoding).encode("UTF-8")
# 将回答保存到文件,并按照一定格式命名,只有最后一步才需要转换格式,所以这里不必
def saveAnswerToFile(self, title, vote, date, questionID, answerID, content):
    """Store one answer under a name built from its metadata.

    The file is named ``[date]-[title]-v<vote>-q<questionID>-a<answerID>.html``
    inside the answer directory. The path stays UTF-8 here; conversion to
    the system encoding happens in saveToFile().
    """
    if 'unicode' in str(type(title)):
        title = title.encode("utf-8")
    template = "{}{}[{}]-[{}]-v{}-q{}-a{}.html"
    pieces = (self.saveHtmlDir, self.dirSeparator, date, title, vote,
              questionID, answerID)
    self.saveToFile(content, template.format(*pieces))
# 验证是否已经登陆
def hasLogin(self, content=None):
if content is None:
content = urllib2.urlopen("http://www.zhihu.com").read()
if content.find(self.loginShowName) != -1 and content.find("我的草稿") != -1:
return True
else:
return False
# 计算最大页码
def getMaxPageNumber(self, d):
pages = d('.zm-invite-pager').text().split(' ')
largestValue = 1
for page in pages:
try:
largestValue = string.atoi(page)
except ValueError:
pass
return largestValue
# 获得文件夹下,最近的回答的文件名
def getLatestAnswerFileName(self):
    """Return the saved answer file with the newest date prefix.

    Only names of the form ``[YYYY-MM-DD]...html`` are considered. When
    several files share the newest date, the one listed last wins.
    Returns None when no file matches.
    """
    self.logging("try to get the latest answer")
    # List with the system-encoded path; names are converted to UTF-8 below.
    entries = os.listdir(self.utf8ToSys(self.saveHtmlDir))
    dated = re.compile('^\[(\d{4})-(\d{2})-(\d{2})\].*html$')
    newest_date = ZHYear(0, 0, 0)
    latestFileName = None
    for entry in entries:
        entry = self.sysToUTF8(entry)
        hit = dated.search(entry)
        if hit is None:
            continue
        year, month, day = [string.atoi(part) for part in hit.groups()]
        candidate = ZHYear(year, month, day)
        if candidate.newerThan(newest_date) >= 0:
            newest_date = candidate
            latestFileName = entry
    if latestFileName is None:
        self.logging("no previous answer at all")
    else:
        self.logging("the latest answer is " + latestFileName)
    return latestFileName
# 通过questionID, answerID匹配文件名
def getFileNameByQAndA(self, questionID, answerID):
files = os.listdir(self.utf8ToSys(self.saveHtmlDir))
pattern = re.compile('.*-q{}-a{}.html$'.format(questionID, answerID))
for file in files:
file = self.sysToUTF8(file)
match = pattern.findall(file)
for pp in match:
return pp
return None
# 计算时间,要注意日期转换
def transToCommonDate(self, dateStr):
pattern = re.compile('^(\d{4}-\d{2}-\d{2})$')
match = pattern.findall(dateStr)
if match:
return dateStr
pattern = re.compile('.*昨天\s\d{1,2}:\d{1,2}')
match = pattern.findall(dateStr)
if match:
yestoday = datetime.date.today() - datetime.timedelta(days=1)
return yestoday
pattern = re.compile('^\d{1,2}:\d{1,2}$')
match = pattern.findall(dateStr)
if match:
return datetime.date.today()
return "2000-01-01"
# 备份文件
def backupFiles(self):
    """Copy the whole answer directory to the timestamped backup directory."""
    notice = "backup dir " + self.saveHtmlDir + " -->> " + self.backupDir
    self.logging(notice, True)
    source = self.utf8ToSys(self.saveHtmlDir)
    destination = self.utf8ToSys(self.backupDir)
    shutil.copytree(source, destination)
# 启动线程
def startThread(self):
    """Start the configured number of image download workers."""
    self.logging("download image, using {} threads".format(self.downloadImageThread))
    for index in range(self.downloadImageThread):
        worker = ImageThread(self.lock, "thread-" + str(index),
                             self.queue, self.pack)
        worker.start()
# 检查队列为空,连续5次,即认为结束
def waitForThread(self):
    """Wait for the image queue to drain, then stop every worker.

    The queue counts as drained once it has been seen empty on 5
    consecutive one-second polls. The shutdown signal is then raised and
    this method blocks until every worker has acknowledged it.
    """
    self.logging("waitting for thread to finish", True)
    waitRound = 5
    while waitRound > 0:
        with self.lock:
            if self.queue.qsize() == 0:
                waitRound -= 1
            else:
                # Work reappeared: restart the count so the 5 empty polls
                # really are consecutive.
                waitRound = 5
        time.sleep(1)
    # Tell the workers they may exit (any value >= 0 is the signal).
    with self.lock:
        self.pack.finishedThreadNum = 0
    self.logging("waiting for thread finish", True)
    while True:
        with self.lock:
            # Each exiting worker increments the counter exactly once.
            if self.pack.finishedThreadNum >= self.downloadImageThread:
                break
        time.sleep(1)
    self.logging("main thread finish", True)
# 工作主函数
def work(self):
    """Run one complete crawl: log in, fetch answers, wait for images.

    Returns early, without touching the disk, when logging in fails.
    """
    # The configuration was already loaded in __init__; the extra
    # loadConfig() call here discarded its result and has been removed.
    if not self.checkAndLogin():
        self.logging("login failed, quit")
        return
    if self.downloadImage:
        self.startThread()
    try:
        # True on the first run (fetch everything), False for an update.
        firstTime = self.prepareDirs()
        if not firstTime and self.backup:
            self.backupFiles()
        if self.alwaysGetAll:
            firstTime = True
        self.getUserAnswers(firstTime)
    finally:
        # Always stop the workers: they are non-daemon threads, so an
        # error during crawling would otherwise leave the process hanging.
        if self.downloadImage:
            self.waitForThread()
    self.tagLastModifacationToFile()
# 获取用户的回答
def getUserAnswers(self, all):
    """Walk the answer pages from startPage onward and save each answer.

    all -- True to fetch every answer; forced to True when no previously
           saved answer file is found
    In incremental mode the walk stops as soon as parseAnswerAndSave()
    reports that enough already-saved answers were refreshed.
    """
    # Question and answer IDs of the newest saved file; they stay 0 when
    # there is none or its name does not parse.
    newest = self.getLatestAnswerFileName()
    latestQID = latestAID = 0
    if newest is None:
        all = True
    else:
        id_pattern = re.compile('^\[\d{4}-\d{2}-\d{2}\].*-q(\d{1,50})-a(\d{1,50}).html$')
        for qid, aid in id_pattern.findall(newest):
            latestQID, latestAID = qid, aid

    def load_page(number):
        raw = urllib2.urlopen("{}?page={}".
                              format(self.answerURL, number)).read()
        return pq(raw)

    # The first page is always fetched; it also yields the page count.
    currentPage = self.startPage
    d = load_page(currentPage)
    pageMax = self.getMaxPageNumber(d)
    while True:
        self.logging("parsing page {} of {}".format(currentPage, pageMax), True)
        enough = self.parseAnswerAndSave(d, latestQID, latestAID, all)
        if enough and not all:
            return
        if currentPage >= pageMax:
            break
        currentPage += 1
        d = load_page(currentPage)
# 用@替换可能的特殊字符
def transTitle(self, title):
target = r'<>/\|:"*,?\''
for i in target:
title = title.replace(i, '@')
# 万一注入攻击了呢……
# 这是一种逗逼行为
title = title.replace('/', ' OR ').replace('sudo', 'SUDO').\
replace('rm', 'RM').replace('mv', 'MV')
return title
# 将d中对应的每个答案都copy到一个html中
def parseAnswerAndSave(self, d, latestQID, latestAID, getAll):
# Save every answer listed on one answers page as its own HTML file.
#
# d         -- PyQuery document of the answers page
# latestQID -- question ID of the newest saved answer (0 when there is none)
# latestAID -- answer ID of the newest saved answer (0 when there is none)
# getAll    -- True for a full crawl, False for an incremental update
#
# Returns True when, in incremental mode, self.oldLimit has dropped to 0 or
# below (enough already-saved answers were refreshed); returns False once
# every item on the page has been handled.
#
# NOTE(review): this file lost its indentation, so the nesting of the
# section from "if self.hasMeetOld or getAll:" down to saveAnswerToFile
# cannot be confirmed from the text; verify against the original
# formatting before refactoring.
zmAll = d('#zh-profile-answer-list')('.zm-item')
self.logging("parsing answers, there are {} answers in this page".format(len(zmAll)))
for zm in zmAll:
ele = pq(zm)
title = self.transTitle(ele(".question_link").html())
vote = ele(".zm-item-vote-count").html()
# The date comes in several formats, e.g. 2014-01-01, 昨天15:30 (yesterday 15:30), 11:15.
date = ele(".zm-item-rich-text")(".answer-date-link").text()
dateAll = date.split(" ")
# Only the last space-separated token is handed to transToCommonDate.
date = self.transToCommonDate("{}".format(dateAll[len(dateAll)-1]))
# Whatever the answer is, build its full URL and request it.
answerURL = "http://www.zhihu.com" + ele(".question_link").attr('href')
pattern = re.compile(r'/question/(\d{1,50})/answer/(\d{1,50})')
match = pattern.findall(answerURL)
# NOTE(review): questionID/answerID are only assigned inside this loop; if
# the URL does not match they are unbound (first item) or stale (later items).
for pp in match:
questionID = pp[0]
answerID = pp[1]
self.logging("trace answer: " + answerURL)
content_stream = urllib2.urlopen(answerURL)
pageContent = content_stream.read()
# This call is skipped when images are not being downloaded.
if self.downloadImage:
pageContent = self.downloadImageAndReplace(pageContent)
# Check whether a local copy already exists. In a full crawl latestQID and
# latestAID are 0, so nothing matches by mistake; after the match, oldLimit
# more answers are visited and overwritten.
if not getAll and latestQID == questionID and latestAID == answerID:
self.logging("meet old answer, will parse {} more answers".
format(self.oldLimit), True)
self.hasMeetOld = True
# Delete the old file first so the later name match cannot delete the new file.
if self.hasMeetOld or getAll:
self.oldLimit -= 1
oldFileName = self.getFileNameByQAndA(questionID, answerID)
if oldFileName is not None:
self.logging("covering " + oldFileName, True)
# Direct file-system operation.
os.remove(self.utf8ToSys(self.saveHtmlDir + self.dirSeparator + oldFileName))
self.saveAnswerToFile(title, vote, date, questionID, answerID,
pageContent.strip())
if not getAll and self.oldLimit <= 0:
self.logging("enough old files, stop parsing", True)
return True
# Sleep for a random time between sleepMin and sleepMax seconds.
timeToSleep = random.uniform(self.sleepMin, self.sleepMax)
if 'unicode' in str(type(title)):
title = title.encode("utf-8")
self.logging("save answer of {} finished, sleep {} seconds for next request".\
format(title, timeToSleep))
time.sleep(timeToSleep)
return False
# 把要下载的源地址和存储路径存到队列中
def putIntoQueue(self, src, des):
self.lock.acquire()
# 路径转成系统encoding
tup = (src, self.utf8ToSys(des))
self.queue.put(tup)
self.logging("queue length {} aftre put {} into queue".
format(self.queue.qsize(), tup))
self.lock.release()
# 把绝对路径替换为相对路径,同时下载图片
def downloadImageAndReplace(self, content):
pattern = re.compile('<img src="//(s\d.zhimg.com/misc/whitedot.jpg)(".{1,500})data-actualsrc="http://(pic\d.zhimg.com/\w{1,50}.jpg)(".{0,300})>')
match = pattern.findall(content)
for line in match:
pairs = line[2].split('/')
dir = pairs[0]
name = pairs[1]
if not os.path.exists(self.saveHtmlDir + self.dirSeparator + dir):
os.mkdir(self.saveHtmlDir + self.dirSeparator + dir)
# 放进队列
self.putIntoQueue("http://" + line[2],
self.saveHtmlDir + self.dirSeparator + dir + self.dirSeparator + name)
# 这是要把原网页中,图片的位置全部替换为本地的文件位置
rawString = '<img src="//' + line[0] + line[1] + 'data-actualsrc="http://' + line[2] + line[3]
#newString = '<img src="' + self.saveHtmlDir + self.dirSeparator + line[2] + line[1] + 'data-actualsrc="' + self.saveHtmlDir + self.dirSeparator + line[2] + line[3]
newString = '<img src="'+ line[2] + line[1] + 'data-actualsrc="' + self.saveHtmlDir + self.dirSeparator + line[2] + line[3]
#self.logging("replace {} -->> {}".format(rawString, newString))
content = content.replace(rawString, newString)
pattern = re.compile('http://(pic\d.zhimg.com/\w{1,50}.jpg)')
match = pattern.findall(content)
for line in match:
pairs = line.split('/')
if not os.path.exists(self.utf8ToSys(self.saveHtmlDir + self.dirSeparator + pairs[0])):
os.mkdir(self.utf8ToSys(self.saveHtmlDir + self.dirSeparator + pairs[0]))
self.putIntoQueue("http://" + pairs[0] + "/" + pairs[1],
self.saveHtmlDir + self.dirSeparator + pairs[0] + self.dirSeparator + pairs[1])
content = content.replace("http://" + pairs[0] + "/" + pairs[1],
pairs[0] + self.dirSeparator + pairs[1])
#self.logging("replace {} -->> {}".format("http://" + pairs[0] + "/" + pairs[1], pairs[0] + self.dirSeparator + pairs[1]))
return content
if __name__ == '__main__':
    # Build the crawler (reads ./zhihu.conf) and run one crawl.
    z = ZhihuGet()
    z.work()