/
ecocn_org.py
66 lines (54 loc) · 1.93 KB
/
ecocn_org.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#coding:utf-8
from spider.spider import route, Handler, spider
import _env
from os.path import abspath, dirname, join
from operator import itemgetter
PREFIX = join(dirname(abspath(__file__)))
HTTP = 'http://www.ecocn.org/%s'
@route('/portal\.php')
class portal(Handler):
def get(self):
for link in self.extract_all('<dt class="xs2"><a href="', '"'):
spider.put(HTTP%link)
@route('/article-\d+-\d+.html')
class article(Handler):
def get(self):
link = self.extract( 'class="pn" href="', '" target=""> 中英对照')
spider.put(HTTP%link)
@route('/forum\.php')
class forum(Handler):
from mako.lookup import Template
template = Template(filename=join(PREFIX, 'template/rss.xml'))
page = []
def get(self):
name = self.extract('id="thread_subject">', '</a>')
if not name:
return
name = name.split(']', 1)[-1].strip()
html = self.extract('<div class="t_fsz">', '<div id="comment_')
html = html[:html.rfind('</div>')]
tid = int(self.get_argument('tid'))
print tid, name
self.page.append((tid, self.request.url, name, html))
@classmethod
def write(cls):
page = cls.page
page.sort(key=itemgetter(0), reverse=True)
with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
rss.write(
cls.template.render(
rss_title='经济学人 . 中文网',
rss_link='http://www.ecocn.org',
li=[
dict(
link=link,
title=title,
txt=txt
) for id, link, title, txt in cls.page
]
)
)
if __name__ == '__main__':
spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
#10个并发抓取线程 , 网页读取超时时间为30秒
spider.run(10, 30)