#!/usr/bin/python3
# -*- coding: utf-8 -*-
import urllib.parse
import asyncio
import aiohttp
try:
    # JoinableQueue was folded into Queue in Python 3.4.4 and removed in 3.5
    from asyncio import JoinableQueue as Queue
except ImportError:
    from asyncio import Queue
from bs4 import BeautifulSoup
import re
import os
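
# listing-page URL template; %s is filled in with the page number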
CL_URL = "http://cl.bearhk.info/thread0806.php?fid=15&search=&page=%s"
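# MIME types decoded as text; responses with other types are read as raw bytes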
CONTENT_TYPE_TEXT = {
    'text/html',
    'application/xml',
    'text/xml',
    'text/*'
}

class Request(object):
    """A single fetch task; creating one enqueues it on the given spider."""
    def __init__(self, spider, url, request_type='get', params=None, data=None,
                 content_type='text'):
        self.spider = spider
        self.url = url
        self.request_type = request_type
        self.params = params
        self.data = data
        self.content_type = content_type
        # append this request to the spider's task queue
        self.spider.append_request(self)

    def handle_func(self, content):
        """Override in subclasses to process the fetched content."""
        pass

class Spider:
    def __init__(self, max_tries=30, max_tasks=10, timeout=5,
                 rootDir=os.getcwd()):
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = asyncio.get_event_loop()
        self.q = Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.timeout = timeout
        self.rootDir = rootDir

    def close(self):
        self.session.close()

    def append_request(self, request):
        self.q.put_nowait(request)

    @asyncio.coroutine
    def _get_request(self):
        r = yield from self.q.get()
        return r

    @asyncio.coroutine
    def fetch(self, request_type, url, params, data):
        """Fetch one URL, retrying up to max_tries times."""
        tries = 0
        while tries < self.max_tries:
            try:
                print("try %s ----> %d times" % (url, tries))
                with aiohttp.Timeout(self.timeout):
                    # honor the request's method and body instead of
                    # hard-coding a GET
                    response = yield from self.session.request(
                        request_type.upper(), url, params=params, data=data)
                if response.status == 200:
                    # strip any "; charset=..." parameter before matching
                    content_type = response.headers.get(
                        'content-type', '').split(';')[0].strip()
                    if content_type in CONTENT_TYPE_TEXT:
                        with aiohttp.Timeout(self.timeout):
                            content = yield from response.text(encoding='GBK')
                    else:
                        with aiohttp.Timeout(self.timeout):
                            content = yield from response.read()
                    break
            except asyncio.TimeoutError:
                print("timeout")
            except aiohttp.ClientError as client_error:
                print("client error: %s" % client_error)
            except Exception:
                print("unknown error")
            tries += 1
        else:
            print("try %s ----> more than %d times, quit" % (url, tries))
            return None
        response.release()
        return content

    @asyncio.coroutine
    def _work(self):
        """Process queue items forever."""
        try:
            while True:
                r = yield from self._get_request()
                content = yield from self.fetch(r.request_type, r.url,
                                                r.params, r.data)
                if content:
                    r.handle_func(content)
                # mark the task done even when the fetch failed,
                # so q.join() can eventually finish
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def work(self):
        yield from self._work()

    @asyncio.coroutine
    def spider(self):
        """Run the spider until all queued requests are done."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        yield from self.q.join()
        for w in workers:
            w.cancel()
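
# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical example of driving the spider: subclass Request,
# override handle_func to process the fetched page, enqueue the first
# listing page, and run the event loop until the queue drains. The
# PageRequest name and the link-printing handler are illustrative
# assumptions, not the author's code.
class PageRequest(Request):
    def handle_func(self, content):
        # parse the listing page and print every link found
        soup = BeautifulSoup(content, 'html.parser')
        for a in soup.find_all('a', href=True):
            print(a['href'])

if __name__ == '__main__':
    spider = Spider(max_tasks=5)
    PageRequest(spider, CL_URL % 1)  # Request.__init__ enqueues itself
    try:
        spider.loop.run_until_complete(spider.spider())
    finally:
        spider.close()
        spider.loop.close()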