-
Notifications
You must be signed in to change notification settings - Fork 0
/
tianyajingji.py
132 lines (125 loc) · 4.89 KB
/
tianyajingji.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
# __author__ = 'peter'
from models import Centipede, StaticContent, Stanza
import scrapemark
import logging
from google.appengine.api import memcache
from mapreduce import operation as op
from google.appengine.api import urlfetch
from google.appengine.api import images
from google.appengine.ext import deferred, db
import urlparse
from google.appengine.ext import deferred
import mechanize
import re, itertools
from string import Template
from bottle import template
import bottle
import urllib2
import datetime
# URL template for one page of the Tianya "develop" forum thread list;
# ${pageno} is substituted with the 1-based page number.
url_template=Template('http://www.tianya.cn/new/publicforum/articleslist.asp?pageno=${pageno}&stritem=develop')
# Minimum scraped comment count for a thread to be treated as "hot" and crawled.
threshold=1000
# Tianya serves pages in GB18030 (superset of GBK/GB2312).
encoding='gb18030'
# Let bottle find templates (e.g. centipede.html) in ./templates/ first.
bottle.TEMPLATE_PATH.insert(0, './templates/')
# Public base URL of this app, embedded into the generated QR codes.
host_url='http://baizu.samdeha.com/'
# strptime format of the "发表日期/回复日期" timestamps scraped from thread pages.
HTTP_DATE_FMT = '%Y-%m-%d %H:%M:%S'
def threads():
    """Yield one dict per thread scraped from the forum's list pages.

    Walks list pages 10 down to 1 (oldest of the front pages first) and
    yields scrapemark result dicts with keys: url, title, author,
    views (int), comments (int).
    """
    list_urls = (url_template.substitute(pageno=str(i)) for i in range(10, 0, -1))
    for url in list_urls:
        result = scrapemark.scrape("""
{*
<table name=''>
<tr><td><a href='{{ [threads].url }}'>{{ [threads].title}}</a></td>
<td><a>{{ [threads].author }}</a></td>
<td>{{ [threads].views|int }}</td>
<td>{{ [threads].comments|int }}</td>
<td></td>
</table>
*}
""", url=url, encoding=encoding)
        # scrape() can return None on a fetch/parse failure, or a dict
        # without matches; skip such pages instead of raising
        # KeyError/TypeError and aborting the whole crawl.
        if not result or not result.get('threads'):
            continue
        for thread in result['threads']:
            yield thread
def crawl():
    """Enqueue a deferred `process` task for every hot thread.

    A thread is "hot" when its scraped comment count exceeds `threshold`.
    Each task receives a fresh dict carrying the thread metadata plus an
    empty 'stanzas' mapping to be filled in by `process`.
    """
    for thread in threads():
        if thread['comments'] > threshold:
            payload = {
                'url': thread['url'],
                'author': thread['author'],
                'title': thread['title'],
                'comments': thread['comments'],
                'views': thread['views'],
                'stanzas': {},
            }
            deferred.defer(process, payload)
def process(thread):
    """Scrape the thread author's stanzas from every page of the thread.

    For each page URL produced by `pages(thread)`, scrapes the opening
    post ("发表日期" table) and the author's replies ("回复日期" tables)
    and stores the resulting stanza list under thread['stanzas'][url].
    """
    # The pattern depends only on the author, which is constant for the
    # whole thread — build and compile it ONCE instead of once per page
    # (the original rebuilt the Template and recompiled on every iteration).
    stanza_template=Template(u"""
{*
<table id="firstAuthor">
<tr>
<td>
<a>${author}</a> 发表日期:{{ [stanzas].datetime }}
</td>
</tr>
</table>
<div id="pContentDiv">
<div class="post">
{{ [stanzas].content|html }}
</div>
</div>
*}
{*
<table>
<tr>
<td>
<a>${author}</a> 回复日期:{{ [stanzas].datetime }}
</td>
</tr>
</table>
<div class="post">
{{ [stanzas].content|html }}
</div>
*}
""")
    logging.info(thread['author'])
    pattern = scrapemark.compile(stanza_template.substitute(author=thread['author']))
    logging.info(pattern)
    for url in pages(thread):
        thread['stanzas'][url] = scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']
        logging.info(thread['stanzas'][url])
def new_stanzas(thread, centipede):
    """Yield a Stanza entity (child of *centipede*) for each scraped stanza.

    Iterates thread['stanzas'] (page_url -> list of scraped stanza dicts)
    and converts each dict into a datastore Stanza.
    """
    for page_url, scraped in thread['stanzas'].items():
        for item in scraped:
            # Tianya separates date and time with an ideographic space
            # (U+3000); normalize to ASCII space before strptime.
            raw = re.sub(u'\u3000', ' ', item['datetime'])
            published = datetime.datetime.strptime(raw, HTTP_DATE_FMT)
            yield Stanza(
                parent=centipede,
                page_url=page_url,
                content=db.Text(item['content']),
                published=published,
            )
def pages(thread):
    """Yield the not-yet-crawled page URLs of *thread*, then persist results.

    Generator with side effects: after the caller (`process`) has exhausted
    the yielded URLs and filled thread['stanzas'], the code after the yield
    loop runs — it saves the Centipede, builds Stanza entities, and renders
    the static HTML page into StaticContent.
    """
    centipede=Centipede.get_by_key_name(thread['url'])
    # d collects the pagination links of the thread (used as an ordered set).
    d={}
    urls=[]
    logging.info(thread['url'])
    centipede_url_components=urlparse.urlparse(thread['url'])
    # netloc+path doubles as the StaticContent key for the rendered page.
    centipede_url_netloc_path=centipede_url_components.netloc + centipede_url_components.path
    # Scrape the top pagination bar; [:-1] drops the trailing link
    # (presumably a "next page" arrow — TODO confirm against live markup).
    for page in scrapemark.scrape("""
<div class="pages" id="pageDivTop">
{*
<a href="{{ [pages] }}"></a>
*}
<span></span>
</div>
""", url=thread['url'])['pages'][:-1]:
        d[page]=1
    if centipede is None:
        # First crawl of this thread: create the entity and a QR code image
        # pointing at our mirrored copy.
        centipede=Centipede(key_name=thread['url'], species=db.Category(u'天涯经济'), author=thread['author'], title=thread['title'], comments=thread['comments'], views=thread['views'], pedes=[])
        urls=[db.Link(thread['url'])]
        # NOTE(review): d.keys() order is arbitrary in Python 2 dicts, so
        # [:-2] does not reliably drop the *last* two pages — verify intent.
        urls.extend([db.Link(key) for key in d.keys()[:-2]])
        # Key for the QR PNG: netloc + path with its extension swapped to .png.
        qr_key=centipede_url_components.netloc + '.'.join([centipede_url_components.path.split('.')[0],'png'])
        img=urlfetch.fetch('http://chart.apis.google.com/chart?cht=qr&chs=200x200&chl='+urllib2.quote(host_url+centipede_url_netloc_path))
        qr_content=StaticContent(key_name=qr_key, body=img.content, content_type='image/png')
        qr_content.put()
    else:
        # Incremental crawl: resume at the saved `next` pointer and only
        # visit pages not already recorded in centipede.pedes.
        urls=[db.Link(centipede.next)]
        urls.extend([db.Link(url) for url in d.keys()[:-2] if url not in centipede.pedes])
    logging.info(urls)
    centipede.pedes.extend(urls)
    # NOTE(review): crashes with IndexError when fewer than two page links
    # were scraped; also subject to the arbitrary-dict-order caveat above.
    centipede.next=db.Link(d.keys()[-2])
    for url in urls:
        yield url
    # --- Everything below runs only after the caller exhausts the generator ---
    centipede.put()
    content=StaticContent.get_by_key_name(centipede_url_netloc_path)
    stanzas=[stanza for stanza in new_stanzas(thread, centipede)]
    if content is None:
        content=StaticContent(key_name=centipede_url_netloc_path, template=db.Text(template('centipede.html', centipede=centipede, stanzas=stanzas, template_next=True)), content_type='text/html')
    else:
        # Re-render using the previously stored template string as the template.
        content.template=db.Text(template(content.template, centipede=centipede, stanzas=stanzas, template_next=True))
    # db.put(stanzas)
    content.put()
    # Invalidate any cached copy of the rendered page.
    memcache.delete(content.key().name())