def _retry(self, request, reason, spider):
    """Build and return a retry copy of *request*, or give up after max_retry_times.

    Mirrors Scrapy's RetryMiddleware._retry: tracks the attempt count in
    request.meta['retry_times'], and on the final allowed attempt rotates
    the proxy via proxymng. Returns the new retry request, or None
    (implicitly) once the retry budget is exhausted.
    """
    # Attempt number for THIS retry (1 on the first failure).
    retries = request.meta.get('retry_times', 0) + 1
    if retries <= self.max_retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        # Bypass the dupe filter so the identical URL is fetched again.
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        # NOTE(review): this increments req_count on the ORIGINAL request's
        # meta after copy() — whether the copy shares this counter depends on
        # how request.copy() treats meta; confirm the intended bookkeeping.
        request.meta['req_count'] = request.meta.get('req_count', 0) + 1
        # NOTE(review): the proxy is rotated only on the LAST allowed retry
        # (retries == max), so earlier retries reuse the failing proxy —
        # presumably intentional (give the proxy more chances), but verify.
        if retries == self.max_retry_times:
            # is_proxy defaults to True: proxying is assumed unless disabled.
            if request.meta.get('is_proxy', True):
                proxymng.delete_proxy(retryreq.meta.get('proxy'))
                retryreq.meta['proxy'] = proxymng.get_proxy()
        return retryreq
    else:
        # Retry budget exhausted: log and fall through, returning None.
        logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
def start_request(self, param):
    """Fetch one page of Liepin job-search results.

    :param param: dict of request options — ``city_id`` (default '010'),
        ``query`` keyword (default 'IOS'), ``page`` as a 1-based string
        (default '1'), and ``is_use_proxy`` (default True).
    :return: whatever ``self.parse_data`` returns on success, else None.
    """
    city = param.get('city_id', '010')
    query = param.get('query', 'IOS')
    page = param.get('page', '1')
    is_use_proxy = param.get('is_use_proxy', True)
    # Liepin's curPage query parameter is 0-based; callers pass 1-based pages.
    page = int(page) - 1
    url = 'https://www.liepin.com/zhaopin/?pubTime=&ckid=17c370b0a0111aa5&fromSearchBtn=2&compkind=&isAnalysis' \
          '=&init=-1&searchType=1&dqs=%s&industryType=&jobKind=&sortFlag=15&industries=&salary=&compscale' \
          '=&clean_condition=&key=%s&headckid=49963e122c30b827&curPage=%s' % (city, query, page)
    utils.log('liepin url:%s' % url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }
    # Consistency fix: the original acquired the proxy with the hard-coded
    # literal 'liepin' but released it with self.name; use self.name for both
    # (matching the sibling boss/lagou spiders).
    self.proxies = proxymng.get_proxy(self.name) if is_use_proxy else None
    # Retry the fetch up to 3 times.
    for _ in range(3):
        r = self.request(url, headers, self.proxies)
        if r is not None and r.status_code == 200 and r.ok:
            # Fetch succeeded.
            return self.parse_data(r, param)
        if is_use_proxy:
            # Fetch failed: discard the current proxy IP and switch to a new one.
            proxymng.delete_proxy(self.name, self.proxies)
            self.proxies = proxymng.get_proxy(self.name)
    utils.log('liepin request data Exception')
    return None
def start_request(self, param):
    """Fetch one page of BOSS Zhipin (zhipin.com) job-search results.

    :param param: dict of request options — ``city_id`` (default
        '101010100'), ``query`` keyword (default 'python'), ``page`` as a
        string (default '1'), and ``is_use_proxy`` (default True).
    :return: whatever ``self.parse_data`` returns on success, else None.
    """
    city = param.get('city_id', '101010100')
    query = param.get('query', 'python')
    page = param.get('page', '1')
    is_use_proxy = param.get('is_use_proxy', True)
    url = 'https://www.zhipin.com/c%s/h_%s/?query=%s&page=%s&ka=page-%s' % (city, city, query, page, page)
    utils.log('boss request url:%s' % url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }
    self.proxies = proxymng.get_proxy(self.name) if is_use_proxy else None
    # Retry the fetch up to 3 times.
    for _ in range(3):
        r = self.request(url, headers, self.proxies)
        # Idiom fix: compare to None with `is not`, not `!=`.
        if r is not None and r.status_code == 200 and r.ok:
            # Fetch succeeded.
            return self.parse_data(r, param)
        if is_use_proxy:
            # Fetch failed: discard the current proxy IP and switch to a new one.
            proxymng.delete_proxy(self.name, self.proxies)
            self.proxies = proxymng.get_proxy(self.name)
    utils.log('boss request data Exception')
    return None
def process_exception(self, request, exception, spider):
    """Downloader-middleware hook: log the failure, rotate the proxy when
    proxying is enabled, and return the request so it is rescheduled."""
    message = 'process_exception error_request request exception:%s url:%s proxy:%s' % (
        exception, request.url, str(request.meta))
    logging.error(message)
    # is_proxy defaults to True: assume proxying unless explicitly disabled.
    uses_proxy = request.meta.get('is_proxy', True)
    if uses_proxy:
        stale_proxy = request.meta.get('proxy')
        proxymng.delete_proxy(stale_proxy)
        request.meta['proxy'] = proxymng.get_proxy()
    return request
def error_parse(self, failure):
    """Errback: log the failed request, replace its proxy (when one was in
    use), bump its priority, and re-yield it for another attempt."""
    req = failure.request
    utils.log('error_parse url:%s meta:%s' % (req.url, req.meta), logging.ERROR)
    old_proxy = req.meta.get('proxy', None)
    if old_proxy:
        # Drop the proxy that just failed and swap in a fresh one.
        proxymng.delete_proxy(old_proxy)
        req.meta['proxy'] = proxymng.get_proxy()
    req.priority = req.priority + self.priority_adjust
    yield req
def start_request(self, param):
    """Fetch one page of Lagou positionAjax search results.

    :param param: dict of request options — ``city_name`` (default
        u'上海'), ``query`` keyword (default 'IOS'), ``page`` as a string
        (default '1'), and ``is_use_proxy`` (default True).
    :return: whatever ``self.parse_data`` returns on success, else None.
    """
    city = param.get('city_name', '上海')
    query = param.get('query', 'IOS')
    page = param.get('page', '1')
    is_use_proxy = param.get('is_use_proxy', True)
    city_encoded = urllib.urlencode({'city': city.encode('utf-8')})
    url = 'https://www.lagou.com/jobs/positionAjax.json?{0}&needAddtionalResult=false'.format(
        city_encoded)
    utils.log('lagou request url:%s' % url)
    # POST form body expected by the positionAjax endpoint.
    data = {
        'first': 'true',
        'kd': query,
        'pn': page,
    }
    # Consistency fix: the original acquired the proxy with the hard-coded
    # literal 'lagou' but released it with self.name; use self.name for both.
    self.proxies = proxymng.get_proxy(self.name) if is_use_proxy else None
    # Cookies are persisted as JSON on disk. The `with` block closes the
    # file; the original's extra f.close() inside it was redundant.
    with open('spider/lagou_cookies.text', 'r') as f:
        cookies = json.loads(f.read())
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }
    # Retry the fetch up to 3 times.
    for _ in range(3):
        r = self.request(url=url, headers=headers, cookies=cookies, data=data, proxies=self.proxies)
        if r is not None and r.status_code == 200 and r.ok:
            # Fetch succeeded.
            return self.parse_data(r, param)
        if is_use_proxy:
            # Fetch failed: discard the current proxy IP and switch to a new one.
            proxymng.delete_proxy(self.name, self.proxies)
            self.proxies = proxymng.get_proxy(self.name)
    utils.log('lagou request data Exception')
    return None