22mmspider.py
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from optparse import OptionParser
import urlparse2  # urlparse-compatible API (urlparse, urljoin)
import urllib2
import redis
import threading
import os
import time
import logging
import revise_wgetter  # local download helper: revise_wgetter.download(url, index, path)
class Base(object):
    u"""Base class for the 22mm image spider."""

    def __init__(self, url, image_max_num, thread_num, file_path):
        u"""Initialize the spider.

        Args:
            url: start URL to crawl from
            image_max_num: maximum number of images to download (-1 means unlimited)
            thread_num: number of crawler/downloader thread pairs to start
            file_path: directory where downloaded images are stored
        """
        self.url = url
        self.image_max_num = image_max_num
        self.thread_num = thread_num
        self.file_path = file_path
        self.client = redis.StrictRedis(host='localhost', port=6379)
        self.img_url_pattern = []
        self.url_pattern = []
        logging.basicConfig(filename=os.path.join(os.getcwd(), 'log.txt'),
                            level=logging.WARNING, filemode='w',
                            format='%(asctime)s - %(levelname)s: %(message)s')
    def startspider(self):
        u"""Entry point: set up Redis, make sure the storage directory
        exists, start the worker threads, then wait until the download
        quota is exhausted."""
        self.redis_prepare()
        self.check_file_path()
        self.thread_control()
        while int(self.client.get('image_max_num')) != 0:
            time.sleep(1)  # idle instead of busy-spinning while workers run
        #self.client.flushall()
    def redis_prepare(self):
        u"""Set up the Redis working keys for this crawl."""
        self.client.setnx('image_max_num', self.image_max_num)
        self.client.lpush('web_url_goto', self.url)
        self.client.sadd('web_url_visited', self.url)
        self.client.sadd('image_url_visited', self.url)
        self.client.setnx('image_downloaded_num', 0)
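
    # Key layout, as used by the worker methods below:
    #   image_max_num        remaining download quota, -1 means unlimited
    #   web_url_goto         list of page URLs waiting to be crawled
    #   web_url_visited      set of page URLs already queued or crawled
    #   image_url_goto       list of image URLs waiting to be downloaded
    #   image_url_visited    set of image URLs already queued
    #   image_downloaded_num counter, also used to name downloaded files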
    def thread_control(self):
        u"""Start the worker threads.

        Spawns thread_num pairs of threads: one to crawl pages and
        extract URLs, one to download the queued images.
        """
        for i in xrange(self.thread_num):
            threading.Thread(target=self._extract_url, args=(), name='url_thread').start()
            threading.Thread(target=self.download, args=(), name='download_thread').start()
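
    # Design note: the workers share no Python state; all coordination goes
    # through Redis, whose individual commands (rpop/lpush/sadd/incr) are
    # atomic. The sismember-then-sadd pairs below can still race between
    # threads, but at worst that queues a duplicate URL.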
    def check_file_path(self):
        u"""Create the storage directory if it does not exist yet."""
        if not os.path.exists(self.file_path):
            os.makedirs(self.file_path)
            return 0
        else:
            return 1
    def _extract_url(self):
        u"""Worker loop: pop page URLs from the queue and parse them.

        For each page, extract same-domain page links and image URLs,
        and push any unseen ones onto their respective queues.
        """
        while int(self.client.get('image_max_num')) != 0:
            if self.client.llen('web_url_goto') == 0:
                time.sleep(1)
            else:
                url = self.client.rpop('web_url_goto')
                try:
                    html = urllib2.urlopen(url).read()
                except Exception:
                    logging.warning("cannot open url: %s" % url)
                    continue
                domain = urlparse2.urlparse(url).netloc
                web_url_list = self._extract_web_url(html, url, domain)
                image_url_list = self._extract_img_url(html, domain)
                for web_url in web_url_list:
                    if not self.client.sismember('web_url_visited', web_url):
                        self.client.sadd('web_url_visited', web_url)
                        self.client.lpush('web_url_goto', web_url)
                for image_url in image_url_list:
                    if not self.client.sismember('image_url_visited', image_url):
                        self.client.sadd('image_url_visited', image_url)
                        self.client.lpush('image_url_goto', image_url)
                        logging.info("%s--->%s" % (url, image_url))
    def _extract_web_url(self, html, url, domain):
        u"""Extract same-domain page URLs from an HTML document.

        Args:
            html: page content to parse
            url: address of the page being crawled (base for relative links)
            domain: domain of the current site
        Returns:
            list of same-domain URLs found in the page
        """
        url_list = []
        content = BeautifulSoup(html, 'html.parser').findAll('a')
        for item in content:
            href = item.get('href')
            ans = urlparse2.urljoin(url, href)
            ans_netloc = urlparse2.urlparse(ans).netloc
            if domain == ans_netloc:
                url_list.append(ans)
        return url_list
    def _extract_img_url(self, html, domain):
        u"""Extract image URLs from an HTML document.

        Args:
            html: page content to parse
            domain: domain of the current site
        Returns:
            list of image URLs found in the page's "box-inner" div
        """
        image_div = BeautifulSoup(html, 'html.parser').find(id="box-inner")
        image_url_list = []
        if image_div is not None:
            image_script = None
            for div_children in image_div.children:
                if div_children.name == "script":
                    image_script = div_children.string
            if image_script is not None:
                # The image URL is the first quoted string in the inline
                # script; swapping 'big' for 'pic' in its path yields the
                # full-size image on this site.
                image_url = image_script.split('"')[1]
                urls = image_url.split('big')
                image_url = urls[0] + 'pic' + urls[1]
                image_url_list.append(image_url)
        return image_url_list
    def download(self):
        u"""Worker loop: pop image URLs from the queue and download them.

        Each download consumes one unit of the image_max_num quota
        unless the quota is -1 (unlimited); a failed download restores
        the unit it consumed.
        """
        while int(self.client.get('image_max_num')) != 0:
            if self.client.llen('image_url_goto') == 0:
                time.sleep(1)
            else:
                if int(self.client.get('image_max_num')) != -1:
                    self.client.decr('image_max_num')
                image_url = self.client.rpop('image_url_goto')
                self.client.incr('image_downloaded_num')
                try:
                    revise_wgetter.download(image_url, int(self.client.get('image_downloaded_num')), self.file_path)
                except Exception:
                    if int(self.client.get('image_max_num')) != -1:
                        self.client.incr('image_max_num')  # give the failed unit back
                    logging.warning('download error: %s' % image_url)
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-o", "--thread", dest="thread_num", default=10, type='int',
                      help="number of worker thread pairs")
    parser.add_option("-f", "--file_path", dest="file_path", default='./pics',
                      metavar="DIR", help="directory to save images to")
    parser.add_option("-i", "--image_max_num", dest="image_max_num", default=-1,
                      type='int', help="maximum number of images to download (-1 = unlimited)")
    options, args = parser.parse_args()
    if len(args) > 0:
        url = args[0]
        thread_num = options.thread_num
        file_path = options.file_path
        image_max_num = options.image_max_num
        spider = Base(url, image_max_num, thread_num, file_path)
        spider.startspider()
        print "Crawl finished"
    else:
        print "error: please supply the start url"
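
# Example invocation (a sketch; assumes a Redis server listening on
# localhost:6379 and uses a placeholder URL for the real start page):
#
#   python 22mmspider.py -o 10 -f ./pics -i 100 http://www.example.com/
#
# This starts 10 crawler/downloader thread pairs, saves images to ./pics,
# and stops after 100 downloads.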