# -*- coding: utf-8 -*-
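# zhilian_crawler_consumer.py
# Consumer side of the Zhilian crawler: pops job-detail URLs from a Redis
# queue, downloads each page, extracts job fields with XPath, and writes
# the result to MongoDB.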
__author__ = 'liang'
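# Python 2-era workaround: force UTF-8 as the default string encoding, and
# make the sibling `common` package importable.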
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append('../')
import time
import cookielib
from common.producer_customer import Customer
from common import webutil
from common import mongoutil
from lxml import etree
import zhilian_crawler_data
import config
class zhilian_crawler_consumer(Customer):
    def __init__(self, name, queue_name, process_number=1, redis_host=None, redis_port=None):
        Customer.__init__(self, name, queue_name, process_number=process_number, redis_host=redis_host, redis_port=redis_port)
        self.mongo = mongoutil.getmondbv2(config.mongo_host, config.mongo_port, config.mongo_db, config.mongo_table)
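    # Entry point invoked with each item popped from the queue (presumably
    # by the Customer base class).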
    def run(self, item):
        try:
            if item is not None:
                self.crawl(item)
        except Exception as e:
            print u'error crawling detail page %s: %s' % (item, e)
            raise Exception(u'failed to crawl detail page')
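    # Download one detail page, then hand it to the parser and the writer.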
    def crawl(self, url):
        if url is None or len(url) < 1:
            return
        ua = webutil.get_user_agent()
        cookieJar = cookielib.MozillaCookieJar()
        data_dict = {}
        data_dict['type'] = 'zhilian'
        data_dict['version'] = 1
        data_dict['url'] = url
        try:
            html_src = webutil.request(url, headers=zhilian_crawler_data.get_search_url_header(), ua=ua, cookie=cookieJar, timeout=60, retry=5, encoding='utf-8', proxy=None)
            # Sanity check: anything under 100 bytes or over 10 MB is treated
            # as a failed download.
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'failed to download detail page')
            data_dict['html'] = html_src
            self.parse_html(html_src, data_dict)
            self.save_data(url, data_dict)
        except Exception as e:
            print u'error downloading detail page: %s' % e
            raise Exception(u'failed to download detail page')
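    # Extract job fields from the page. Every XPath is optional: a missing
    # field is skipped rather than treated as an error.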
    def parse_html(self, html_src, data_dic):
        try:
            tree = etree.HTML(html_src)
            job_title_xpath = tree.xpath('.//*[@class="inner-left fl"]/h1/text()')
            if job_title_xpath:
                data_dic['job_title'] = job_title_xpath[0]
            publish_time_xpath = tree.xpath('.//*[@id="span4freshdate"]/text()')
            if publish_time_xpath:
                data_dic['publish_time'] = publish_time_xpath[0]
            work_place_xpath = tree.xpath('.//*[@class="terminal-ul clearfix"]/li[2]/strong/a/text()')
            if work_place_xpath:
                data_dic['work_place'] = work_place_xpath[0]
            work_request_xpath = tree.xpath('.//*[@class="tab-inner-cont"]/p/text()')
            if work_request_xpath:
                # The job description spans several <p> text nodes; join them
                # into one string.
                data_dic['work_request'] = ''.join(work_request_xpath)
        except Exception as e:
            print u'error parsing html page: %s' % e
            raise Exception(u'failed to parse html page')
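    # Upsert the crawled record into MongoDB, keyed by the page URL and the
    # current timestamp when no _id is present yet.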
    def save_data(self, url, data_dic):
        # Guard: skip anything that is not a non-empty dict.
        if data_dic is None or not isinstance(data_dic, dict) or len(data_dic) == 0:
            return
        try:
            now = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())
            if '_id' not in data_dic:
                doc_id = mongoutil.get_id_key(url, now)
            else:
                doc_id = data_dic['_id']
            # Drop any accidental None key left over from parsing.
            if None in data_dic:
                del data_dic[None]
            if 'do_time' not in data_dic:
                data_dic['do_time'] = now
            if 'uptime' not in data_dic:
                data_dic['uptime'] = time.time()
            mongoutil.updatev3(self.mongo, doc_id, data_dic)
        except Exception as e:
            print u'error saving data: %s' % e
            raise Exception(u'failed to save data')
if __name__ == '__main__':
    zcc = zhilian_crawler_consumer('zhilian_job_consumer', 'zhilian:detail_url', redis_host=config.redis_host, redis_port=config.redis_port)
    zcc.start()