class LagouSitemap(ThreadPoolCrawler):
    db = get_db('htmldb')
    col = getattr(db, 'lagou_url')    # collection

    def get(self, url, *args, **kwargs):
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        }
        return super(LagouSitemap, self).get(url, headers=headers, *args, **kwargs)

    def handle_response(self, url, response):
        """Handle the http response: parse the html directly for a 200 status
        code, and handle other status codes as needed."""
        self.logger.info('url:%s', url)
        if response.status_code == 200:
            self.handle_html(url, response.text)

    def init_urls(self):
        for i in range(1, 541):    # max is 540
            url = 'http://www.lagou.com/upload/sitemap/xml/lagou_sitemap_%d.xml' % i
            self.urls.append(url)

    def handle_html(self, url, html):
        all_loc = extract_all('<loc>', '</loc>', html)
        self.logger.info('%s', pformat(all_loc))
        self.col.insert_many([{'url': url} for url in all_loc])
def __init__(self):
    tornado.web.Application.__init__(self, url_patterns, **settings)
    self._redis = _db.redis_client
    self._motor = get_db(CONFIG.MONGO.DATABASE, client='motor')
    connect(CONFIG.MONGO.DATABASE,
            host=CONFIG.MONGO.HOST,
            port=CONFIG.MONGO.PORT,
            io_loop=tornado.ioloop.IOLoop.current())    # motorengine
class CheckXiciCralwer(ThreadPoolCrawler):
    """CheckXiciCralwer checks that proxies still work and promptly removes dead ones."""

    db = get_db('htmldb')
    col = getattr(db, 'xici_proxy')    # collection
    timeout = (10, 10)    # connect timeout and read timeout
    concurrency = 100

    def init_urls(self):
        """init_urls: load all ip proxies from mongo"""
        url = 'http://www.lagou.com/'
        for ip_info in self.col.find(no_cursor_timeout=True):
            ip, port = ip_info['ip'], ip_info['port']
            if ip and port:
                self.urls.append((url, ip, port))    # tuple

    def get(self, url, proxies, timeout):
        headers = {
            'User-Agent': random_search_engine_ua()
        }
        return requests.get(
            url, proxies=proxies, timeout=timeout, headers=headers
        )

    def run_async(self):
        self.logger.info('before check %d proxies', self.col.count())
        for url_list in chunks(self.urls, 100):    # handle 100 per batch
            pprint(url_list)
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrency) as executor:
                future_to_url = {
                    executor.submit(
                        self.get, url,
                        proxies=get_proxy_dict(ip, int(port)),
                        timeout=self.timeout
                    ): (url, ip, port) for (url, ip, port) in url_list
                }
                for future in concurrent.futures.as_completed(future_to_url):
                    url, ip, port = future_to_url[future]
                    try:
                        response = future.result()
                        if response.status_code != 200:
                            self.logger.info(
                                'status_code %d, delete proxy %s:%s',
                                response.status_code, ip, port
                            )
                            self.col.delete_one({'ip': ip, 'port': port})
                    except Exception as e:    # the custom get() used before swallowed exceptions instead of raising them
                        self.logger.info('delete proxy %s:%s', ip, port)
                        self.col.delete_one({'ip': ip, 'port': port})
                    else:
                        self.handle_response(url, response)
        self.logger.info('after check %d proxies', self.col.count())

    def handle_response(self, url, response):
        """handle_response verifies a proxy by sending a simple request and checking for timeouts."""
        if response:
            self.logger.info('url: %s %s', url, response.status_code)
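# A minimal sketch of the proxies mapping that requests expects, which is presumably
# what the repo's get_proxy_dict(ip, port) helper builds. The helper below is
# illustrative only and is not the repo's implementation.
def _example_proxy_dict(ip, port):
    """Map both schemes to a single 'http://ip:port' forward proxy."""
    proxy_url = 'http://%s:%d' % (ip, port)
    return {'http': proxy_url, 'https': proxy_url}

# usage: requests.get('http://www.lagou.com/', proxies=_example_proxy_dict('1.2.3.4', 8080), timeout=(10, 10))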
class XiciCrawler(ThreadPoolCrawler):
    db = get_db('htmldb')
    col = getattr(db, 'xici_proxy')    # collection
    sleep = 10

    def init_urls(self):
        urls = [
            'http://www.xicidaili.com/wn/%d',
            'http://www.xicidaili.com/wt/%d',
            'http://www.xicidaili.com/nn/%d',
            'http://www.xicidaili.com/nt/%d',
        ]
        for url in urls:
            for i in range(1, 10):
                self.urls.append(url % i)

    def bulk_update_to_mongo(self, ip_dict_list):
        """bulk_update_to_mongo

        :param ip_dict_list: OrderedDict([('country', 'Cn'),
            ('ip', u'115.46.80.120'),
            ('port', u'8123'),
            ('address', u'\u5e7f\u897f\u5357\u5b81'),
            ('anonymous', u'\u9ad8\u533f'),
            ('type', u'HTTP'),
            ('speed', u'3.687\u79d2'),
            ('connect_time', u'0.737\u79d2'),
            ('live_time', u'1\u5206\u949f'),
            ('verify_time', u'16-07-26 10:54')])
        """
        bulk = self.col.initialize_ordered_bulk_op()
        for ip_info_dict in ip_dict_list:
            self.logger.info('%s', ip_info_dict['ip'])
            query_dict = {
                'ip': ip_info_dict['ip'],
                'port': ip_info_dict['port'],
            }
            update_dict = {'$set': ip_info_dict}
            bulk.find(query_dict).upsert().update(update_dict)
        bulk.execute()
        self.logger.info('count %d', self.col.count())

    def handle_response(self, url, response):
        """handle_response stores the proxy ip info into mongodb

        :param url:
        :param response: requests.models.Response
        """
        self.logger.info('handle url: %s', url)
        if not response:
            return
        if response.status_code == 200:
            html = response.text
            html_parser = XiciHtmlParser(url, html)
            ip_info_dict_yield = html_parser.parse()
            self.bulk_update_to_mongo(ip_info_dict_yield)
        elif response.status_code == 503:
            change_ip()
            self.urls.append(url)    # retry
class KuaidailiCrawler(ThreadPoolCrawler):
    """http://www.kuaidaili.com/"""

    db = get_db('htmldb')
    col = getattr(db, 'kuaidaili_proxy')    # collection
    sleep = 10

    def init_urls(self):
        _range = 1, 10
        for i in range(_range[0], _range[1] + 1):
            url_list = [
                url % i for url in [
                    'http://www.kuaidaili.com/free/inha/%d/',
                    'http://www.kuaidaili.com/free/intr/%d/',
                    'http://www.kuaidaili.com/free/intr/%d/',
                    'http://www.kuaidaili.com/free/outha/%d/',
                ]
            ]
            self.urls.extend(url_list)

    def bulk_update_to_mongo(self, ip_dict_list):
        bulk = self.col.initialize_ordered_bulk_op()
        for ip_info_dict in ip_dict_list:
            self.logger.info('%s:%s', ip_info_dict['ip'], ip_info_dict['port'])
            query_dict = {
                'ip': ip_info_dict['ip'],
                'port': ip_info_dict['port'],
            }
            update_dict = {'$set': ip_info_dict}
            bulk.find(query_dict).upsert().update(update_dict)
        bulk.execute()
        self.logger.info('count %d', self.col.count())

    def handle_response(self, url, response):
        self.logger.info('handle url: %s', url)
        if not response:
            return
        if response.status_code == 200:
            html = response.text
            html_parser = KuaidailiHtmlParser(url, html)
            ip_info_dict_yield = html_parser.parse()
            self.bulk_update_to_mongo(ip_info_dict_yield)
        elif response.status_code == 503:
            change_ip()
            self.urls.append(url)    # retry
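# initialize_ordered_bulk_op() used by the two crawlers above is the legacy pymongo bulk
# API and was removed in pymongo 4.x. A minimal sketch of the same upsert-by-ip:port
# pattern with bulk_write, assuming pymongo >= 3.0; `col` and `ip_dict_list` stand in
# for the collection and parsed proxy rows used above.
from pymongo import UpdateOne


def bulk_upsert_proxies(col, ip_dict_list):
    ops = [
        UpdateOne(
            {'ip': d['ip'], 'port': d['port']},  # match an existing proxy by ip:port
            {'$set': d},                         # update it in place, or insert if missing
            upsert=True,
        )
        for d in ip_dict_list
    ]
    if ops:
        col.bulk_write(ops, ordered=True)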
class ParseJob(object):
    """Processes the crawled html pages, extracting the needed fields from the
    html and storing them in a separate collection."""

    db = get_db('htmldb')

    def __init__(self):
        self.from_col = getattr(self.db, 'lagou_html')
        self.to_col = getattr(self.db, 'lagou_job')
        self.key = self.__class__.__name__
        self.last_id = int(r.get(self.key) or 0)

    def set_id(self, last_id=0):
        r.set(self.key, last_id)

    def run_job(self):
        """Extraction task for lagou job pages."""
        for doc_dict in self.from_col.find({
            '_id': {
                '$gte': self.last_id
            }
        }).sort('_id', 1):
            if 'job' in doc_dict['url']:    # job url
                doc = ObjectDict(doc_dict)
                assert doc.url and doc.html
                if LagouCrawler.is_deleted_html(doc.html, False):
                    self.from_col.delete_one({'url': doc.url})
                    continue
                job_parser = LagouHtmlParser(doc.url, doc.html)
                data_dict = job_parser.parse_job()
                if data_dict is None:
                    self.from_col.delete_one({'url': doc.url})
                    continue
                self.logger.info('handle url: %s %s:%s',
                                 doc.url, data_dict['source'], data_dict['job'])
                if not DEBUG:
                    self.to_col.update({
                        '_id': doc._id,
                    }, {'$set': data_dict}, upsert=True)
                    self.set_id(doc._id)
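# Assumed usage (a sketch): ParseJob keeps its progress cursor in redis under the class
# name, so re-running the script resumes from the last processed _id.
if __name__ == '__main__':
    ParseJob().run_job()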
    url = re.sub(r'product_id=(\d+)&', 'product_id=%s&' % str(product_id), url)
    r = requests.get(url, headers=header_dict, data=data)    # keyword params
    # sample JSONP response returned by the endpoint:
    # BEHBAAHHHAEAICGV({"result":0,"msg":"","data":{"result":0,"msg":"查询成功","productId":200021,"categoryId":39,"categoryName":"经典","productName":"“新”名片","productImg":"http://cloud.ininin.com/1453727435050.jpg","title":"“新”名片_铜版纸名片设计_铜版纸名片制作_铜版纸名片报价_云印","keywords":"铜版纸名片设计,铜版纸名片制作,铜版纸名片报价","description":"高档300克铜版纸,具有手感厚重,笔直挺括,质地密实、高白度、设计表现强特点。报价:最便宜3.5元至最贵59元/盒(100张),多款铜版纸名片,5种可选铜版纸名片处理工艺。","pImages":"http://cloud.ininin.com/1453727455067.jpg,http://cloud.ininin.com/1453727457303.jpg,http://cloud.ininin.com/1453727459607.jpg,http://cloud.ininin.com/1453727472730.jpg,http://cloud.ininin.com/1453727468168.jpg","priceDesc":"8元/盒起","simpleDesc":"“新”名片【铜版纸】——案头常备的优质名片,满99包邮!","productDesc":"[{\"title\":\"下单流程\",\"content\":\"\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1453727509640.jpg\\\"/\u003e\u003c/p\u003e\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1453727519881.jpg\\\"/\u003e\u003c/p\u003e\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1457590273025.jpg\\\"/\u003e\u003c/p\u003e\u003cp style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1470700220636.png\\\" style\u003d\\\"max-width:100%;\\\"/\u003e\u003c/p\u003e\u003cp\u003e\u003cbr/\u003e\u003c/p\u003e\u003cp\u003e\u003cbr/\u003e\u003c/p\u003e\"},{\"title\":\"产品介绍\",\"content\":\"\u003cdiv style\u003d\\\"text-align: center;\\\"\u003e\u003cimg src\u003d\\\"http://cloud.ininin.com/1453727574011.jpg\\\"/\u003e\u003c/div\u003e\"},{\"title\":\"使用场景\",\"content\":\"\"},{\"title\":\"规格参数\",\"content\":\"\"},{\"title\":\"下单须知\",\"content\":\"\"},{\"title\":\"物流说明\",\"content\":\"\"},{\"title\":\"售后服务\",\"content\":\"\"}]","baseInfoName":"材质类型_数量_成品尺寸-覆膜","preferntialInfo":"[{\"preferentialSort\":1,\"preferentialTitle\":\"优惠套餐\",\"preferentialDescription\":\"购买新名片印刷套餐,立享更多优惠\",\"preferentialLink\":\"http://design.ininin.com/category/131.html\"}]","addedServicesList":[],"params":{"300g铜版纸_1盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_2盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_5盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_10盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_20盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_40盒_90mm*54mm":{"覆膜":{"覆哑膜":1}},"300g铜版纸_100盒_90mm*54mm":{"覆膜":{"覆哑膜":1}}},"type":0,"standardType":0,"showType":0,"websiteShow":1,"homeShow":1,"homeShowIcon":1,"listShow":1,"listShowIcon":2,"minDistNum":-1,"targetId":"0","valuationMethod":0,"valuationValue":0.15,"productVariety":0}})
    content = r.content.decode('utf-8')
    content = content[content.find('(') + 1: -2]    # strip the JSONP callback wrapper
    return json.loads(content).get('data')


def _get_product_id_from_url(url='http://www.ininin.com/product/200021.html#300g铜版纸_1盒_90mm*54mm-覆哑膜'):
    """_get_product_id_from_url

    :return: str product_id, e.g. 200021
    """
    return url.rsplit('/', 1)[1].split('.')[0]


_DB = get_db('ininin', client='mongo')
_COL = getattr(_DB, 'ininin_data')


def _replace_dot_key_to_dash(data_dict):
    """_replace_dot_key_to_dash: mongo keys may not contain dots, so replace them with '-'

    :param data_dict:
    """
    params_dict = data_dict.get('params')
    if not params_dict:
        return data_dict
    new_params_dict = {}
    for k, v in params_dict.items():
        k = k.replace('.', '-')
        new_params_dict[k] = v
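# The slicing in the JSONP handling above (content[content.find('(') + 1:-2]) assumes the
# body always ends with exactly two characters after the payload. A hedged alternative
# sketch that strips any callback wrapper with a regex; this helper is illustrative only
# and is not part of the original module.
import json
import re


def _strip_jsonp(text):
    """Return the JSON payload inside callbackName({...}); fall back to the raw text."""
    match = re.match(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return match.group(1) if match else text

# usage: json.loads(_strip_jsonp(r.content.decode('utf-8'))).get('data')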
class LagouCrawler(object):

    curl_str = """
    curl 'http://www.lagou.com/activityapi/icon/showIcon.json?callback=jQuery11130673730597542487_1469756732278&type=POSITION&ids=2034591%2C2147192%2C1899225%2C2112714%2C1993280%2C2107221%2C1980427%2C959204%2C1570458%2C1382996%2C2164841%2C1535725%2C2015991%2C1909703%2C1924731%2C1924585%2C1917417%2C1961327%2C1949207%2C1949217%2C1961114%2C1962767%2C1915882%2C1958811%2C1929575%2C1929708%2C1926524%2C1914752&_=1469756732279' -H 'Cookie: ctk=1469756728; JSESSIONID=006FA63ABE28DD910325F0A2B21D80DD; LGMOID=20160729094529-D8AB7E5EBC00B32D65F29DC499FDEEE0; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1469756733; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1469756733' -H 'X-Anit-Forge-Code: 0' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4' -H 'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36' -H 'Accept: text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01' -H 'Referer: http://www.lagou.com/' -H 'X-Requested-With: XMLHttpRequest' -H 'Connection: keep-alive' -H 'X-Anit-Forge-Token: None' --compressed
    """
    base_url = CurlStrParser(curl_str).get_url()
    headers = CurlStrParser(curl_str).get_headers_dict()
    db = get_db('htmldb')
    col = getattr(db, 'lagou_html')    # collection
    sleep = 10

    def __init__(self, domain):
        self.domain = domain
        self.url_manager = UrlManager(domain)
        self.incr_id = IncrId(self.__class__.__name__)

    def add_url(self, url):
        self.url_manager.add_url(url)

    def delay_url(self, url, nums=10):
        self.logger.info('delay url: %s', url)
        self.url_manager.delay_url(url, nums)

    def add_url_list(self):
        for i in range(1, 532):
            url = 'http://www.lagou.com/upload/sitemap/xml/lagou_sitemap_%d.xml' % i
            self.logger.info('sitemap url: %s', url)
            html = self.get_response(url).text
            all_loc_url = extract_all('<loc>', '</loc>', html)
            self.logger.info('%s', pformat(all_loc_url))
            self.add_url(all_loc_url)

    def update_headers(self, changeip=True):
        if changeip:
            change_ip()
        r = get(self.base_url)
        h = cookie_dict_from_cookie_str(r.headers.get('Set-Cookie'))
        cookies_dict = cookie_dict_from_cookie_str(self.headers['Cookie'])
        cookies_dict.update(h)
        self.headers['Cookie'] = cookies_dict
        self.logger.info('headers: %s', pformat(self.headers))

    def get_response(self, url, **kwargs):
        if CONFIG.CRAWLER.USE_PROXY:
            kwargs.setdefault('proxies', CONFIG.CRAWLER.PROXIES)
        self.logger.info('now crawler: %s', url)
        return get(url, headers=self.headers, **kwargs)

    def url_nums(self):
        return self.url_manager.url_nums()

    def next_url(self, inorder=True):
        if inorder:
            return self.url_manager.first_url()
        else:
            return self.url_manager.last_url()

    def remove_url(self, url):
        self.logger.info('remove url: %s', url)
        return self.url_manager.remove_url(url)

    @staticmethod
    def is_deleted_html(html, verbose=True):
        _ = '信息已经被删除' in html    # marker text on lagou's "job deleted" page
        if _ and verbose:
            print('job-deleted page')
        return _

    @staticmethod
    def is_block_html(html, verbose=True):
        _ = 'blocked_404' in html
        if _ and verbose:
            print('blocked page')
        return _

    @staticmethod
    def is_check_html(html, verbose=True):
        _ = '访问验证-拉勾网' in html    # marker text on lagou's captcha page
        if _ and verbose:
            print('captcha page')
        return _

    def save_html(self, url, html):
        self.logger.info('save html of url: %s', url)
        if html:
            self.col.update(
                {
                    '_id': self.incr_id.get(),
                    'url': url,
                },
                {
                    '$set': {'html': html}
                },
                upsert=True
            )

    def run(self):
        if not self.url_nums():
            self.add_url_list()
        self.update_headers()
        while self.url_nums() > 0:
            if self.sleep:
                time.sleep(self.sleep + random.randint(1, 5))
            url = self.next_url()
            if url is not None:
                r = self.get_response(url)
                if not r:
                    self.delay_url(url)
                    self.update_headers()
                    continue
                html = r.text
                if self.is_block_html(html) or self.is_check_html(html):
                    self.delay_url(url)
                    self.update_headers()
                    continue
                else:
                    self.save_html(url, html)
                    self.remove_url(url)
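# Assumed usage (a sketch): run() seeds the url queue from the sitemap files on first start,
# refreshes cookies via update_headers(), and loops until the UrlManager is drained. The
# domain string passed to the constructor here is an assumption for illustration.
if __name__ == '__main__':
    crawler = LagouCrawler('www.lagou.com')
    crawler.run()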
class CheckKuaidailiCralwer(CheckXiciCralwer):
    db = get_db('htmldb')
    col = getattr(db, 'kuaidaili_proxy')    # collection
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import _env
from pprint import pprint as pp

from tornado.util import ObjectDict

from lib._db import get_db, redis_client as r
from html_parser import Bs4HtmlParser
from lagou_parser import LagouHtmlParser
from spider import LagouCrawler
from web_util import logged

DEBUG = True
db = get_db('htmldb')
lagou_html_col = getattr(db, 'lagou_html')    # collection


def test_get_db():
    o = lagou_html_col.find_one({'_id': 1234})
    html = o['html']
    # print html
    pp(o)


def test_get_html():
    import chardet
    _id = 46167
    o = lagou_html_col.find_one({'_id': _id})
    p = Bs4HtmlParser('', o['html'])
    print(p.html)
    t = p.bs.find('p', class_='msg')