manager.py
from gevent import monkey; monkey.patch_all()
from gevent.lock import BoundedSemaphore
import gevent
from parser import *
from queue import Queue
from pybloom import ScalableBloomFilter
import json
from redis import Redis
from utilities.util_conf import *
import requests
# Explicit stdlib imports for names used below (they may also be re-exported by the
# wildcard imports above)
from html import unescape
from urllib import parse

# Maximum number of greenlets allowed to run download work concurrently
sem = BoundedSemaphore(BoundedSemaphoreNum)
# Redis connection used for crawl bookkeeping (success counter, URL sets)
redis = Redis()
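
# Overview: manager() drives a breadth-first crawl. Seed URLs go into a local queue;
# each depth level spawns htmlrun() greenlets (throttled by the semaphore above) to
# download and parse pages, and cssrun() fetches stylesheets. Work is exchanged
# through the Redis sets 'HTML', 'CSS' and 'STATUS' (presumably filled in by the
# parser/saver helpers imported from parser), and a ScalableBloomFilter persisted to
# urlBloomfilter.bloom deduplicates URLs across runs.
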
def htmlrun(url, user_type='pc', proxies=None):
    sem.acquire()
    try:
        html = download_html(url, user_type, proxies)
        # If the URL after redirection is not an internal URL, skip it
        # if not is_interior_url(redirectUrl, url):
        #     return
    except Exception:
        sem.release()
        return
    if html is None:
        sem.release()
        return
    html = unescape(html)        # decode HTML entities
    html = parse.unquote(html)   # decode percent-encoded characters
    parser = htmlParser()
    parser.feed(html)
    htmlSaver(url, html, parser)
    print(url, 'downloaded successfully!')
    redis.incr('success')
    sem.release()

def cssrun(url):
    sem.acquire()
    for _ in range(REDOWNLOAD_TIMES):
        try:
            r = requests.get(url)
            r.raise_for_status()
            css = r.text
            cssSaver(url, css)
            print(url + ' downloaded successfully!')
            sem.release()
            return
        except Exception as e:
            print(url, repr(e))
    # All retries failed; still release the semaphore
    sem.release()
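
# manager() runs the crawl: initUrlList is a seed URL (str) or a list of seed URLs;
# max_deep bounds the crawl depth (0 means effectively unlimited), max_pageNum bounds
# how many pages are scheduled (0 means unlimited), crawl_type is passed through
# htmlrun() to download_html() as its user_type argument, and proxies is forwarded to
# download_html(). It returns the number of pages scheduled for download.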
def manager(initUrlList, max_deep=MAX_DEEP, max_pageNum=MAX_PAGENUM,
            crawl_type=CRAWL_TYPE, proxies=PROXIES):
    redis.set('success', 0)
    # Number of pages scheduled for download
    page_num = 0
    htmlQueue = Queue()
    if isinstance(initUrlList, list):
        initUrl = initUrlList[0]
        for url in initUrlList:
            htmlQueue.put(url)
    elif isinstance(initUrlList, str):
        initUrl = initUrlList
        htmlQueue.put(initUrl)
    # 0 means "no limit" for both settings
    if max_pageNum == 0:
        max_pageNum = -1
    if max_deep == 0:
        max_deep = 9999
    # Reuse the persisted Bloom filter if one exists, otherwise start a new one
    try:
        with open('urlBloomfilter.bloom', 'rb') as f:
            sbf = ScalableBloomFilter().fromfile(f)
        print('Bloomfilter loaded successfully!')
    except Exception:
        sbf = ScalableBloomFilter(initial_capacity=10000, error_rate=0.00001,
                                  mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    for deep in range(max_deep):
        gList = []
        while not htmlQueue.empty():
            url = htmlQueue.get()
            # Only crawl URLs not seen before (seed URLs are always crawled)
            if url not in sbf or deep == 0:
                gList.append(gevent.spawn(htmlrun, url, crawl_type, proxies))
                max_pageNum -= 1
                page_num += 1
                if max_pageNum == 0:
                    # Page limit reached: stop scheduling downloads
                    break
        gevent.joinall(gList)
        # Mark successfully handled URLs as seen
        while redis.scard('STATUS') > 0:
            url = redis.spop('STATUS').decode()
            sbf.add(url)
        if max_pageNum == 0:
            break
        # Pull newly discovered page URLs for the next depth level
        while redis.scard('HTML') > 0:
            url = redis.spop('HTML').decode()
            htmlQueue.put(url)
        # No more URLs to crawl
        if htmlQueue.empty():
            break
    # Download the CSS files collected during the crawl
    while redis.scard('CSS') > 0:
        url = redis.spop('CSS').decode()
        url = parse.urljoin(initUrl, url)
        gList.append(gevent.spawn(cssrun, url))
    gevent.joinall(gList)
    # Finally, persist the Bloom filter to disk
    with open('urlBloomfilter.bloom', 'wb') as f:
        sbf.tofile(f)
    return page_num

if __name__ == '__main__':
    initUrlList = []
    manager(initUrlList)
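
# Usage sketch (hypothetical values):
#   manager(['http://example.com'], max_deep=2, max_pageNum=100)
# would crawl up to two link levels deep and schedule at most 100 page downloads
# starting from example.com, then persist the seen-URL Bloom filter for the next run.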