#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import string
import time
import traceback
from urllib2 import urlopen

from sqlalchemy import func

import config
import db
import parser as par
from models import *
from utils import TornadoFormatter

SITE_URL = 'http://ssdut.dlut.edu.cn'
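
# News list pages live at SITE_URL + "/index.php/News/student/p/<p>/", where
# <p> is the 1-based page number. Each list entry carries a 'link' to a detail
# page, which parser.ssdut_news_parse() turns into a structured record.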


class SSdutSiteCrawler(object):

    def __init__(self):
        '''Build the URL template for the paginated news list.'''
        self._news_url_template = string.Template(
            SITE_URL + "/index.php/News/student/p/$p/")
        self._init_going = False

    def page_url(self, p):
        '''Return the URL of the p-th news list page.'''
        url = self._news_url_template.substitute(p=p)
        logging.debug("page url = %r" % url)
        return url

    def get_page_result(self, p):
        '''Fetch list page p and parse it into a page result.'''
        src = urlopen(self.page_url(p)).read()
        return par.ssdut_news_list(src)
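
    # Incremental update strategy: every list page reports the site's total
    # record count. Comparing it with max(New.id) in the local db tells us how
    # many new items appeared; we then fetch just that many entries from the
    # front of the first list page, assigning ids downward from the site's
    # total so local ids stay aligned with the site count.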
    def update_db(self, p=1):
        # TODO fix holes in the id sequence when updating
        db_max_id = db.ses.query(func.max(New.id)).one()[0]
        site_res = self.get_page_result(p)
        logging.info("records on site = %r, max_id in db = %r" %
                     (site_res.total_records, db_max_id))
        news_id = site_res.total_records
        if db_max_id < site_res.total_records:
            n = site_res.total_records - db_max_id
            logging.info("will update %r news" % n)
            # update news here
            # assume n <= 12, i.e. every new item is on this first list page
            for new in site_res.news_list:
                if n <= 0:
                    break
                n -= 1
                logging.debug("%r news left to fetch" % n)
                # do update
                src = urlopen(SITE_URL + new['link']).read()
                detail = par.ssdut_news_parse(src)
                r = New(
                    id=news_id,
                    raw=detail.raw,
                    title=detail.title,
                    link=new['link'],
                    body=detail.body,
                    clean_body=detail.clean_body,
                    date=detail.date,
                    publisher=detail.publisher,
                    source=detail.source,
                    source_link=new['source_link'],
                    sha1=detail.sha1,
                    search_text=detail.search_text)
                db.ses.add(r)
                db.ses.commit()
                logging.info("%r added to db, id = %r" % (r, r.id))
                news_id -= 1
        else:
            logging.info("no news to update")
        logging.debug("update finished")
    def reset_news_db(self):
        '''Wipe the news table, then crawl list pages 1..219 and
        store every news item.'''
        # delete all records in db
        for r in New.query.all():
            db.ses.delete(r)
        db.ses.commit()
        logging.debug("deleted all news records in db")

        # get all the news list pages
        res_list = []
        for p in xrange(1, 220):
            res_list.append(self.get_page_result(p))

        # fetch each news detail page and store it in db
        news_id = res_list[0].total_records
        for page in res_list:
            for new in page.news_list:
                try:
                    src = urlopen(SITE_URL + new['link']).read()
                    detail = par.ssdut_news_parse(src)
                    r = New(
                        id=news_id,
                        raw=detail.raw,
                        title=detail.title,
                        link=new['link'],
                        body=detail.body,
                        clean_body=detail.clean_body,
                        date=detail.date,
                        publisher=detail.publisher,
                        source=detail.source,
                        source_link=new['source_link'],
                        sha1=detail.sha1,
                        search_text=detail.search_text)
                    db.ses.add(r)
                    db.ses.commit()
                    logging.info("%r added, link=%r, page_no = %r" %
                                 (r, r.link, page.page_no))
                except Exception:
                    traceback.print_exc()
                    logging.error("failed to store news, link=%r, page_no=%r" %
                                  (new['link'], page.page_no))
                finally:
                    news_id -= 1
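

# Entry point: configure logging, seed the db on first run (guarded by the
# kv.db_inited flag), then poll for new items every config.update_interval
# seconds.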
if __name__ == "__main__":
    updater = SSdutSiteCrawler()

    # set up logging to console and file
    lg = logging.getLogger()
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    # console_handler.setFormatter(TornadoFormatter(color=True))
    file_handler = logging.FileHandler('crawler.log')
    file_handler.setLevel(logging.DEBUG)
    # file_handler.setFormatter(TornadoFormatter(color=False))
    lg.addHandler(console_handler)
    lg.addHandler(file_handler)
    lg.setLevel(logging.DEBUG)

    # kv presumably comes in via `from models import *` (a persistent
    # key-value store; db_inited flags whether the initial crawl has run)
    if kv.db_inited:
        logging.info("Initial data already loaded, begin updating")
    else:
        logging.info("begin crawling initial data...")
        updater.reset_news_db()
        kv.db_inited = 'true'
        logging.info("db init finished")

    while True:
        updater.update_db()
        time.sleep(config.update_interval)