Пример #1
0
    def process_response(self, request, response, spider):        
        if 'dont_retry' in request.meta:
            print 'dont retry in meta'
            return response
        #print "response.status = %s" % (response.status)
        #print "request.url = %s" % (request.url)
        
        #if not (str(request.url).find("SearchGroupedFlightsJSONMinimum")>-1):            
        #    if str(response.body).find("[null,null]")>-1:
        #        print "response.body = %s" % (response.body)
        #        reason = response_status_message(400)
        #        return self._retry(request, reason, spider) or response
            
        uuids = re.findall('\w{8}-\w{4}-\w{4}-\w{4}-\w{12}', response.body)
        price = re.findall('[0-9]*\.[0-9]{2}RoundTrip', response.body)
        
        if(len(uuids)>0):
            print "uuids: %s" % (uuids)

        if response.status in [200] and (str(request.url).find("SearchGroupedFlightsJSONMinimum")>-1) and int(request.meta.get('dormiu_bool', 0))<1:
            reason = response_status_message(response.status)
            segundos = random.randint(10, 15)
            print "Espera a resposta:"
            print "Dormindo %ss..." % (segundos)
            #time.sleep(15)
            time.sleep(segundos)
            #retryreq = request.copy()
            #retryreq.meta['dormiu_bool'] = 1
            request.meta['dormiu_bool'] = 1
            #return self._retry(retryreq, reason, spider) or response    

        if response.status in self.retry_http_codes:
            print "Voltou erro 400, tenta de novo!"
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        
        if uuids[0]=='00000000-0000-0000-0000-000000000000':
            print "Uid 000, tenta de novo!"
            retries_uuid = request.meta.get('retry_times_uuid', 0) + 1
            if retries_uuid <= self.max_retry_wrong_uuid:
                request.meta['retry_times_uuid'] = retries_uuid
                print "uuids error!: %s" % (uuids)
                print "uuids retry count: %s" % (retries_uuid)
                reason = response_status_message(400)
                return self._retry(request, reason, spider) or response        
        
        if not (str(request.url).find("SearchGroupedFlightsJSONMinimum")>-1):
            print "Nao tem preco ainda, tenta de novo"
            print "price: %s" % (price)
            if not len(price)>0:
                print "dorme e espera preco!"
                #dorme um pouco
                time.sleep(random.randint(2, 7))
                reason = response_status_message(400)
                return self._retry(request, reason, spider) or response
                        
        return response
    def _redirect(self, redirected, request, spider, reason):
        """Decide whether to follow, re-issue, or drop a redirect.

        Three outcomes:
        1. amazon spider redirected off-domain -> re-issue the ORIGINAL request;
        2. within the redirect TTL/limit       -> follow the redirect;
        3. otherwise                           -> record failure stats and raise.

        NOTE(review): logging goes through spider.logger, self.logger and
        spider._logger in different branches — confirm all three exist on
        the classes involved.
        """
        reason = response_status_message(reason)
        # TTL is initialised once per request chain via setdefault and
        # decremented on each hop; redirect_times counts hops taken so far.
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1
        # Case 1: amazon spider bounced to a non-amazon URL. The domain test
        # is a fixed character slice of the URL — fragile; TODO confirm it
        # matches the URL shapes this spider actually sees.
        if spider.name == "amazon" and redirected.url[11:17] != "amazon" and redirects <= self.max_redirect_times:
            spider.logger.info("redirect to wrong url: %s" % redirected.url)
            new_request = request.copy()
            # dont_filter so the re-issued duplicate is not deduplicated away.
            new_request.dont_filter = True
            new_request.meta["redirect_times"] = redirects
            spider.logger.info("in _redirect redirect_times: %s re-yield response.request: %s" % (redirects, request.url))
            return new_request

        # Case 2: still within budget — follow the redirect, carrying over
        # the filter flag, an adjusted priority, and the URL trail.
        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                                               [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            self.logger.debug("Redirecting %s to %s from %s for %s times "%(reason, redirected.url, request.url, redirected.meta.get("redirect_times")))
            return redirected
        else:
            # Case 3: budget exhausted — bump crawl stats (only for requests
            # headed to the main parse callback), record the failure, drop.
            self.logger.debug("Discarding %s: max redirections reached"%request.url)
            request.meta["url"] = request.url
            if request.meta.get("callback") == "parse":
                spider.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'],
                                                     spiderid=request.meta['spiderid'],
                                                     appid=request.meta['appid'])
            spider._logger.info(
                " in redicrect request error to failed pages url:%s, exception:%s, meta:%s" % (request.url, reason, request.meta))
            self.stats.set_failed_download_value(request.meta, reason)
            raise IgnoreRequest("max redirections reached")
Пример #3
0
 def process_response(self, request, response, spider):
     """Standard retry hook: re-fetch responses with a retryable status."""
     # Per-request opt-out takes precedence over everything else.
     if 'dont_retry' in request.meta:
         return response
     status = response.status
     if status not in self.retry_http_codes:
         return response
     return self._retry(request, response_status_message(status), spider) or response
Пример #4
0
 def process_response(self, request, response, spider):
     """Retry on configured HTTP codes, dropping the proxy that failed."""
     if response.status not in self.retry_http_codes:
         return response
     reason = response_status_message(response.status)
     # Discard the proxy associated with this failed request.
     self.delete_proxy(request.meta.get('proxy', False))
     print('返回值异常, 进行重试...')
     return self._retry(request, reason, spider) or response
Пример #5
0
    def process_response(self, request, response, spider):
        """Sleep a few minutes before retrying 301/302 responses."""
        if response.status not in (301, 302):
            # Everything else goes through the normal retry pipeline.
            return super(SleepRetryMiddleware, self).process_response(request, response, spider)
        print('retry ' + response.url)
        sleep(360)  # few minutes
        reason = response_status_message(response.status)
        return self._retry(request, reason, spider) or response
Пример #6
0
    def process_response(self, request, response, spider):
        """Retry every non-200 response unless dont_retry is set."""
        log.msg('KxRetry process_response ===========')
        if 'dont_retry' in request.meta:
            return response
        if response.status == 200:
            return response
        reason = response_status_message(response.status)
        return self._retry(request, reason, spider) or response
Пример #7
0
 def process_response(self, request, response, spider):
     """Re-fetch responses whose status is in retry_http_codes (e.g. 500/599)."""
     if "proxy" in request.meta:
         logger.debug("Use proxy: " + request.meta["proxy"] + "to crawler")
     if request.meta.get('dont_retry', False):
         return response
     if response.status not in self.retry_http_codes:
         return response
     reason = response_status_message(response.status)
     # The proxy that produced the bad status is treated as invalid.
     self._del_invaild_proxy(request)
     return self._retry(request, reason, spider) or response
Пример #8
0
 def process_response(self, request, response, spider):
     """Retry on the configured HTTP codes plus any per-request extras.

     A request can extend the retryable set via
     ``request.meta['retry_http_codes']`` (iterable of int-convertible codes).
     """
     if request.meta.get('dont_retry', False):
         return response
     # BUGFIX: build a fresh set instead of `codes |= ...` on the bound
     # attribute — when self.retry_http_codes is a set, |= mutated it in
     # place and leaked per-request codes into every later request.
     retry_http_codes = set(self.retry_http_codes)
     temporary_codes = request.meta.get('retry_http_codes', [])
     if temporary_codes:
         retry_http_codes |= set(int(x) for x in temporary_codes)
     if response.status in retry_http_codes:
         reason = response_status_message(response.status)
         return self._retry(request, reason, spider) or response
     return response
 def process_response(self, request, response, spider):
     """Retry on configured codes, dropping stale proxy credentials first."""
     if 'dont_retry' in request.meta:
         return response
     if response.status in self.retry_http_codes:
         reason = response_status_message(response.status)
         # Drop stale proxy credentials so the retry re-authenticates.
         # BUGFIX: pop with a default replaces the bare `except:` that
         # used to swallow every exception (including KeyboardInterrupt).
         request.headers.pop('Proxy-Authorization', None)
         return self._retry(request, reason, spider) or response
     return response
Пример #10
0
 def process_response(self, request, response, spider):
     """Retry redirect statuses; halt for manual action on 403/414."""
     if response.status in [300, 301, 302, 303]:
         try:
             reason = response_status_message(response.status)
             return self._retry(request, reason, spider) or response  # retry
         except Exception as e:
             raise IgnoreRequest
     elif response.status in [403, 414]:
         logger.error("%s! Stopping..." % response.status)
         # NOTE: Windows-only; blocks the process until a key is pressed.
         os.system("pause")
         # BUGFIX: this branch used to fall through and implicitly return
         # None, which is invalid for a downloader middleware.
         return response
     else:
         return response
Пример #11
0
 def process_response(self, request, response, spider):
     """On retryable status codes, discard the failing proxy and retry."""
     if request.meta.get('dont_retry', False):
         return response
     status = response.status
     if status not in self.retry_http_codes:
         return response
     reason = response_status_message(status)
     failing_proxy = request.meta.get('proxy')
     print('代理失效:', failing_proxy)
     # Remove the proxy that produced the bad response from the pool.
     self.delete_proxy(failing_proxy)
     self.logger.warning('返回值异常, 进行重试...')
     return self._retry(request, reason, spider) or response
 def process_response(self, request, response, spider):
     """Retry on bad codes; suspend the spider after repeated failures."""
     if 'dont_retry' in request.meta:
         return response
     if response.status not in self.retry_http_codes:
         return response
     # Count this failure under the current key; once the counter reaches
     # the configured limit, sleep to let the target recover.
     key = self.genKey()
     incAttr(self.status, key)
     if self.maxExceptionTime and self.status[key] >= self.maxExceptionTime:
         time.sleep(self.suspendTime)
     reason = response_status_message(response.status)
     return self._retry(request, reason, spider) or response
Пример #13
0
    def process_response(self, request, response, spider):
        """Retry on bad codes, and on redirects to the autohome safety page."""
        if request.meta.get('dont_retry', False):
            return response

        status = response.status
        if status in self.retry_http_codes:
            return self._retry(request, response_status_message(status), spider) or response

        # A landing on the user-verification page also warrants a retry.
        if response.url.startswith("http://safety.autohome.com.cn"):
            return self._retry(request, "userverify retry ", spider) or response
        return response
Пример #14
0
 def enrich_base_data(self, item_loader, response):
     """Populate the loader with the fields common to every crawled item."""
     meta = response.meta
     item_loader.add_value('spiderid', meta.get('spiderid'))
     item_loader.add_value('url', response.request.url)
     item_loader.add_value("seed", meta.get("seed", ""))
     item_loader.add_value("timestamp", time.strftime("%Y%m%d%H%M%S"))
     item_loader.add_value('status_code', response.status)
     item_loader.add_value("status_msg",
                           response_status_message(response.status))
     # Hostname minus its first label (e.g. www.foo.com -> foo.com).
     item_loader.add_value('domain',
                           urlparse(response.url).hostname.split(".", 1)[1])
     item_loader.add_value('crawlid', meta.get('crawlid'))
     item_loader.add_value('response_url', response.url)
Пример #15
0
    def process_response(self, request, response, spider):
        """Retry bad responses through self.proxy after an expiry check."""
        if request.meta.get('dont_retry', False):
            return response
        if response.status not in self.retry_http_codes:
            return response
        reason = response_status_message(response.status)
        # Refresh the proxy if it has expired, then route the retry through it.
        self._check_expire()
        request.meta['proxy'] = 'http://' + self.proxy
        return self._retry(request, reason, spider) or response
Пример #16
0
 def process_response(self, request, response, spider):
     """Retry on configured codes and on 301/302 redirects (captcha-prone)."""
     if request.meta.get('dont_retry', False):
         return response
     status = response.status
     reason = response_status_message(status)
     if status in self.retry_http_codes:
         # Hard failure flagged by status code.
         print(f"======出现{status}错误, url:{response.url}")
         return self._retry(request, reason, spider) or response
     if status in (301, 302):
         # Redirect — may indicate a captcha challenge.
         print(f"======出现{status}错误(重定向), url:{response.url}")
         return self._retry(request, reason, spider) or response
     return response
Пример #17
0
    def process_response(self, request, response, spider):
        """Retry on configured codes; on 403 also drop the banned proxy.

        301/302/400/503 are retried without touching the proxy pool.
        """
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        if response.status == 403:
            # IP banned: remove the proxy from the pool and retry.
            # BUGFIX: only split/delete when a proxy is actually set —
            # meta.get('proxy') may be None, which used to raise
            # AttributeError on .split().
            proxy_spider = request.meta.get('proxy')
            if proxy_spider:
                proxy_redis = proxy_spider.split("//")[1]
                self.delete_proxy(proxy_redis)
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        if response.status in [301, 302, 400, 503]:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        return response
Пример #18
0
    def process_response(self, request, response, spider):
        """Retry configured codes; on 403/416 drop the banned proxy first."""
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        if response.status in [403, 416]:
            # IP banned: remove the proxy and retry with a fresh one.
            proxy_spider = request.meta.get('proxy')
            proxy_redis = proxy_spider.split("//")[1]
            logger.info('IP被封,删除代理重试')
            self.delete_proxy(proxy_redis)
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        # BUGFIX: `response.status == [503, 504]` compared an int to a list
        # and was always False; a membership test was intended.
        if response.status in [503, 504]:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        return response
Пример #19
0
    def process_response(self, request, response, spider):
        """Retry on bad codes, or when a spider-defined xpath matches a 200."""
        if request.meta.get('dont_retry', False):
            return response
        status = response.status
        if status in self.retry_http_codes:
            return self._retry(request, response_status_message(status), spider) or response

        # Content-based retry: the spider flags "soft" failures via an xpath.
        if status == 200 and response.xpath(spider.retry_xpath):
            reason = 'response got xpath "{}"'.format(spider.retry_xpath)
            return self._retry(request, reason, spider) or response
        return response
Пример #20
0
 def process_response(self, request, response, spider):
     """On HTTP 429, pause the whole engine for a cool-down, then retry."""
     if request.meta.get('dont_retry', False):
         return response
     if response.status != 429:
         return response
     print(f'request{spider.counter}')
     print(f'accounts{print_coll()}')
     print(f'429 {time.localtime()}')
     # Stop scheduling new downloads while we wait out the rate limit.
     self.crawler.engine.pause()
     time.sleep(331)
     self.crawler.engine.unpause()
     reason = response_status_message(response.status)
     return self._retry(request, reason, spider) or response
Пример #21
0
 def process_douban_response(self, request, response, spider):
     """On 403/414/302 fetch a fresh proxy and retry; pass others through.

     The proxy service returns JSON of the form
     ``{"num": <count>, "data": [{"ip_and_port": ...}, ...]}``.
     """
     if response.status in [403, 414, 302]:
         reason = response_status_message(response.status)
         print('change ip proxy and retrying...')
         proxyres = requests.get('http://proxy.nghuyong.top').text
         totalproxies = json.loads(proxyres)['num']
         if totalproxies > 0:
             proxylist = json.loads(proxyres)['data']
             proxy = random.choice(proxylist)
             request.meta['proxy'] = "http://" + proxy['ip_and_port']
             # BUGFIX: `or response` so exhausted retries yield the
             # response instead of None (invalid for a middleware).
             return self._retry(request, reason, spider) or response
     # BUGFIX: previously fell through to an implicit None both when the
     # proxy pool was empty and for retryable statuses with no proxies.
     return response
Пример #22
0
 def _enrich_base_data(self, response):
     """Build a new item pre-filled with the fields shared by all items."""
     meta = response.meta
     item = self.get_item_cls()()
     item['spiderid'] = meta['spiderid']
     item['workerid'] = self.worker_id
     item['url'] = meta["url"]
     item["seed"] = meta.get("seed", "")
     item["timestamp"] = time.strftime("%Y%m%d%H%M%S")
     item['status_code'] = response.status
     item["status_msg"] = response_status_message(response.status)
     # Hostname minus its first label (e.g. www.foo.com -> foo.com).
     item['domain'] = urlparse(response.url).hostname.split(".", 1)[1]
     item['crawlid'] = meta['crawlid']
     item['response_url'] = response.url
     return item
Пример #23
0
    def process_response(self, request, response, spider):

        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            logger.error("del proxy retry")
            try:
                del request.meta['proxy']
                return self._retry(request, reason, spider) or response
            except KeyError:
                return self._retry(request, reason, spider) or response
        return response
Пример #24
0
 def process_response(self, request, response, spider):
     """Retry on common error codes, penalising the proxy that was used."""
     if response.status in [400, 403, 404, 429, 500, 502, 503, 504]:
         self.TIMES = 3
         logger.error("%s! error..." % response.status)
         try:
             updateIPPOOLS(self.rconn, request.meta['proxy'].replace('http://', ''), request.meta['status'], -1)
         except Exception:
             # Best effort: the request may carry no proxy/status meta.
             # BUGFIX: narrowed from a bare `except:`, which also caught
             # SystemExit/KeyboardInterrupt.
             pass
         reason = response_status_message(response.status)
         return self._retry(request, reason, spider) or response  # retry
     else:
         return response
 def process_response(self, request, response, spider):
     """Retry on configured codes, sleeping once failures pile up."""
     if 'dont_retry' in request.meta:
         return response
     if response.status in self.retry_http_codes:
         # Bump the failure counter for the current key and suspend the
         # spider once it reaches the configured limit.
         key = self.genKey()
         incAttr(self.status, key)
         threshold = self.maxExceptionTime
         if threshold and self.status[key] >= threshold:
             time.sleep(self.suspendTime)
         reason = response_status_message(response.status)
         return self._retry(request, reason, spider) or response
     return response
Пример #26
0
 def process_response(self, request, response, spider):
     """Log the status, drop the failing proxy, and retry bad responses."""
     print("返回码:")
     print(response.status)
     if request.meta.get('dont_retry', False):
         return response
     if response.status not in self.retry_http_codes:
         return response
     reason = response_status_message(response.status)
     print("重试中间件:")
     print(reason)
     # Discard the proxy associated with this failed request.
     self.delete_proxy(request.meta.get('proxy', False))
     return self._retry(request, reason, spider) or response
Пример #27
0
 def process_response(self, request, response, spider):
     """Handle redirects and auth-failure codes.

     - 200: pass through
     - 30x: retry (captcha and plain redirects were handled by two
       branches that performed identical retries — collapsed into one)
     - 403/414/400: refresh the cookie for this request's code, then retry
     """
     if response.status in [200]:
         return response
     elif response.status in [300, 301, 302, 303]:
         try:
             redirect_url = bytes.decode(response.headers["location"])
             if "/service/captcha" in redirect_url:
                 # Debug marker kept for parity with the original code.
                 print('a' * 30)
             reason = response_status_message(response.status)
             return self._retry(request, reason, spider) or response
         except Exception as e:
             raise IgnoreRequest
     elif response.status in [403, 414, 400]:
         code = request.meta['code']
         self.cookie.upDateCookie(code)
         reason = response_status_message(response.status)
         return self._retry(request, reason, spider) or response
     else:
         return response
Пример #28
0
 def process_response(self, request, response, spider):
     """Retry any non-200 response with a freshly fetched proxy."""
     print(response.text)
     if request.meta.get('dont_retry', False):
         return response
     print('我是新的')
     if response.status in self.retry_http_codes or response.status != 200:
         reason = response_status_message(response.status)
         # BUGFIX: the lock was acquired but never released, so the second
         # bad response would deadlock every worker. `with` guarantees
         # release even if get_proxy() raises.
         with self.lock:
             self.proxy = get_proxy()
             print('ip我是重写的', self.proxy)
         self.logger.warning('返回值异常, 进行重试...')
         return self._retry(request, reason, spider) or response
     return response
Пример #29
0
    def process_response(self, request, response, spider):
        """Retry on configured codes, logging the retry budget used so far."""
        if request.meta.get("dont_retry", False):
            return response

        if response.status in self.retry_http_codes:
            # NOTE(review): the attribute is spelled `loger` — confirm that
            # is its real name on this class before "fixing" it.
            self.loger.info(
                "request url: %s ,response status %s, max try time:%s , have done: %s"
                % (request.url, response.status, self.max_retry_times,
                   request.meta.get("retry_times", 0)))

            reason = response_status_message(response.status)
            # BUGFIX: fall back to the *response*, not the request —
            # returning the request when retries are exhausted would
            # re-schedule it forever.
            return self._retry(request, reason, spider) or response
        return response
Пример #30
0
 def process_response(self, request, response, spider):
     """Retry retryable codes, with extra logging for 403 responses."""
     if request.meta.get('dont_retry', False):
         return response
     if response.status not in self.retry_http_codes:
         return response
     reason = response_status_message(response.status)
     if response.status == 403:
         spider.logger.error("Response 403 Retry, " +
                             log_simple_response(response))
         spider.logger.error("403 Retry Set dont_merge_cookies True, " +
                             log_simple_request(request))
     return self._retry(request, reason, spider) or response
Пример #31
0
 def process_response(self, request, response, spider):
     """Retry bad responses while tracking per-proxy success/failure stats."""
     proxied = 'proxy' in request.meta
     if 'dont_retry' in request.meta:
         return response
     if response.status in self.retry_http_codes:
         if proxied:
             self.proxy_ev.inc_failure(request.meta['proxy'])
         return self._retry(request, response_status_message(response.status), spider) or response
     # Successful response: credit the proxy that delivered it.
     if proxied:
         self.proxy_ev.inc_successes(request.meta['proxy'])
     return response
Пример #32
0
 def errback_httpbin(self, failure):
     """Persist a structured error record for every kind of download failure.

     Classifies the Twisted failure, builds a human-readable message, and
     saves it (with the request's meta) as one JSON document via
     save_error_log. Previously each branch duplicated the save call.
     """
     request = failure.request
     if failure.check(HttpError):
         response = failure.value.response
         errmsg = 'errback <%s> %s , response status:%s' % (
             request.url, failure.value,
             response_status_message(response.status))
     elif failure.check(ResponseFailed):
         errmsg = 'errback <%s> ResponseFailed' % request.url
     elif failure.check(ConnectionRefusedError):
         errmsg = 'errback <%s> ConnectionRefusedError' % request.url
     elif failure.check(ResponseNeverReceived):
         errmsg = 'errback <%s> ResponseNeverReceived' % request.url
     elif failure.check(TCPTimedOutError, TimeoutError):
         errmsg = 'errback <%s> TimeoutError' % request.url
     else:
         errmsg = 'errback <%s> OtherError' % request.url
     # Single persistence path instead of six identical copies.
     self.save_error_log(
         json.dumps({
             'meta': request.meta,
             'errmsg': errmsg
         }))
Пример #33
0
 def process_response(self, request, response, spider):
     """Retry bad responses; for the songlist spider a 404 advances to the
     next album id instead of re-fetching the same URL."""
     if request.meta.get('dont_retry', False):
         return response
     if response.status not in self.retry_http_codes:
         return response
     reason = response_status_message(response.status)
     if response.status == 404 and isinstance(spider, SonglistSpider):
         # Skip this (gone) subject and move on to the next one.
         spider.current_music += 1
         next_url = ('https://music.douban.com/subject/' +
                     str(spider.musiclist[spider.current_music]) + '/')
         return self._retry(request.replace(url=next_url), reason, spider) or response
     time.sleep(10)  # back off before re-requesting the same URL
     return self._retry(request, reason, spider) or response
Пример #34
0
 def process_response(self, request, response, spider):
     """Log every response (with the proxy used) and retry the bad ones."""
     logger.info('[' + str(response.status) + '] ' + response.url + ' proxy:' + str(request.meta.get('proxy', '')))
     if request.meta.get('dont_retry', False):
         logger.info("request.meta.get('dont_retry') is set to 'True'. No need to retry.")
         return response
     if response.status not in self.retry_http_codes:
         return response
     logger.info("Retry from XHRetryMiddleware.process_response: " + request.url)
     return self._retry(request, response_status_message(response.status), spider) or response
Пример #35
0
 def process_response(self, request, response, spider):
     """Retry every non-200 response after rotating to a different proxy."""
     # The standard dont_retry / retry_http_codes checks were deliberately
     # disabled; any non-200 status triggers a proxy swap and retry.
     # if request.meta.get('dont_retry',False):
     #     return response
     # if response.status in self.retry_http_codes:
     if response.status != 200:
         # NOTE(review): self.logger is *called* here — that only works if
         # it is a custom callable, not a standard logging.Logger; confirm.
         self.logger('状态码 %s 异常' % response.status)
         # Allow up to one retry per known proxy before giving up.
         max_retry_times = len(IpProxy.ips)
         reason = response_status_message(response.status)
         ip = request.meta['proxy']
         # Swap in a replacement proxy and widen the retry budget.
         request.meta['proxy'] = self.resetip(ip)
         request.meta['max_retry_times'] = max_retry_times
         self.logger('ip %s 替换为: %s 最大重连次数为: %s'%(ip, request.meta['proxy'], max_retry_times))
         return self._retry(request, reason, spider) or response
     return response
Пример #36
0
 def process_response(self, request, response, spider):
     """Retry on bad codes; for the XML detail spider also retry responses
     whose Ack field is missing or 'Failure'."""
     if request.meta.get('dont_retry', False):
         return response
     if response.status in self.retry_http_codes:
         reason = response_status_message(response.status)
         return self._retry(request, reason, spider) or response
     if spider.name == 'detail_xml_redis_spider':
         data = dict(xmltodict.parse(response.text))
         # BUGFIX: guard against a missing GetItemResponse element, and use
         # .get() so a missing 'Ack' no longer raises KeyError in the
         # logging below (the very branch meant to handle that case).
         data = data.get('GetItemResponse') or {}
         ack = data.get('Ack')
         if ack is None or ack == 'Failure':
             print(ack)
             spider.logger.info(ack)
             return self._retry(request, 'Ack Error', spider) or response
     return response
Пример #37
0
 def process_response(self, request, response, spider):
     """Close the spider on HTTP 429; otherwise retry the usual codes."""
     if request.meta.get('dont_retry', False):
         return response
     status = response.status
     if status == 429:
         # Rate limited: give up entirely rather than hammer the API.
         self.crawler.engine.close_spider(spider)
         return response
     if status in self.retry_http_codes:
         return self._retry(request, response_status_message(status), spider) or response
     return response
Пример #38
0
 def process_response(self, request, response, spider):
     """Retry permanent redirects; on 403/414 refresh the cookie and retry."""
     reason = response_status_message(response.status)
     if response.status in [300, 301, 302, 303]:
         if reason != '301 Moved Permanently':
             # Other redirect flavours are dropped outright.
             raise IgnoreRequest
         return self._retry(request, reason, spider) or response
     if response.status in [403, 414]:
         logger.error("%s! Stopping..." % response.status)
         # NOTE: Windows-only pause for manual intervention.
         os.system("pause")
         update_cookie(request.meta['account_text'], self.rconn, spider.name, request.cookies)
         return self._retry(request, reason, spider) or response
     return response
Пример #39
0
    def process_response(self, request, response, spider):
        """Retry bad codes, and 200s with an empty body unless opted out."""
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            return self._retry(request, response_status_message(response.status), spider) or response

        # A 200 with no body is treated as a soft failure unless the
        # request explicitly allows empty bodies via meta['empty_body'].
        allow_empty = request.meta.get('empty_body', False)
        if not allow_empty and not response.body and response.status == 200:
            return self._retry(request, '200 empty_response_body', spider) or response
        return response
Пример #40
0
 def process_response(self, request, response, spider):
     """Retry configured codes; replace other 4xx/5xx with a marker response."""
     if response.status in self.retry_http_codes:
         reason = response_status_message(response.status)
         # Hook point: drop dead proxies / log here before retrying.
         return self._retry(request, reason, spider) or response
     # Remaining 4xx/5xx responses are swapped for a sentinel response the
     # spider recognises by its url ('4050').
     if str(response.status)[0] in ('4', '5'):
         return HtmlResponse(url='4050')
     # All other statuses pass through untouched.
     return response
Пример #41
0
    def process_response(self, request, response, spider):
        """Retry bad codes; abort the crawl after too many 500 responses."""
        logger.info('process_response function processed http code %s',
                    response.status)
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        # 500s are counted on the middleware class itself, across requests.
        if response.status == 500:
            BizdirectoryDownloaderMiddleware.counter += 1
        if BizdirectoryDownloaderMiddleware.counter > 6:
            raise CloseSpider('Too Many Error:500')

        return response
Пример #42
0
 def process_response(self, request, response, spider):
     """Re-login (refreshing the cookie) when the page lacks the expected
     notification element, then retry the request.

     Image requests (.jpg) are passed through untouched.
     """
     if not request.url.endswith('.jpg'):
         status = response.xpath("./*//i[@class='i-notification mt9 mb9']").extract_first(default=None)
         # Missing marker element => assume the session cookie expired.
         # BUGFIX: identity check (`is None`) instead of `== None`.
         if status is None:
             print('更换Cookie')
             # Run the browser login in a separate process and wait for it.
             p = Process(target=web_driver_login)
             p.start()
             p.join()
             print('更换成功')
             reason = response_status_message(response.status)
             return self._retry(request, reason, spider) or response
         return response
     return response
Пример #43
0
    def handle_error(self, failure):
        """Handle an error due to a non-success status code or other reason.

        If link checking is enabled, saves the broken URL and referrers.

        :param failure: a Twisted ``Failure`` wrapping either an HTTP error
            (with a ``response`` attribute) or a lower-level failure.
        """
        try:
            logging.info("Handle error response status code: {}".format(failure.value.response))
            logging.info("Url that failed: {}".format(
                failure.value.response.request.url))
        except Exception:
            # Narrowed from a bare ``except:`` so that SystemExit /
            # KeyboardInterrupt are not swallowed; the logging above is
            # best-effort only (failure.value may lack a response).
            logging.error("Could not print handle error status code.")

        # If we should not do link check or failure is ignore request
        # and it is not a http error we know it is a last-modified check.
        if (not self.scanner.scan_object.do_link_check or
                (isinstance(failure.value, IgnoreRequest) and not isinstance(
                    failure.value, HttpError))):
            logging.info("We do not do link check or failure is an instance of "
                         "IgnoreRequest: {}".format(failure.value))
            return

        if hasattr(failure.value, "response"):
            # HTTP-level failure: extract status and URL from the response.
            response = failure.value.response
            url = response.request.url
            status_code = response.status
            status_message = response_status_message(status_code)

            if "redirect_urls" in response.request.meta:
                # Set URL to the original URL, not the URL after redirection
                url = response.request.meta["redirect_urls"][0]

            referer_header = response.request.headers.get("referer", None)
        else:
            # Non-HTTP failure (DNS, timeout, ...): no status code available.
            url = failure.request.url
            status_code = -1
            status_message = "%s" % failure.value
            referer_header = None

        broken_url = self.broken_url_save(status_code, status_message, url)

        self.broken_url_objects[url] = broken_url

        # Associate referer using referer header
        if referer_header is not None:
            self.associate_url_referrer(referer_header, broken_url)

        self.associate_url_referrers(broken_url)
Пример #44
0
 def process_response(self, request, response, spider):
     """Inspect redirect responses for cookie / account problems and retry.

     Weibo signals an expired cookie or a locked account via a redirect,
     so any 3xx response is examined:

     * redirect to a login page -> refresh the cookie for this account
     * redirect to the security page -> remove the (locked) account
     * redirect to the public page -> only log a warning

     In every redirect case the request is retried.  Non-redirect
     responses are returned unchanged (the original fell through and
     implicitly returned ``None``, which is invalid for a Scrapy
     downloader middleware).

     :raises IgnoreRequest: if the redirect handling itself fails.
     """
     if response.status in [300, 301, 302, 303]:
         try:
             redirect_url = response.headers["location"]
             if "login.weibo" in redirect_url or "login.sina" in redirect_url:  # cookie expired
                 logger.warning("One Cookie need to be updating...")
                 updateCookie(request.meta['accountText'], self.rconn, spider.name)
             elif "weibo.cn/security" in redirect_url:  # account locked
                 logger.warning("One Account is locked! Remove it!")
                 removeCookie(request.meta["accountText"], self.rconn, spider.name)
             elif "weibo.cn/pub" in redirect_url:
                 logger.warning(
                     "Redirect to 'http://weibo.cn/pub'!( Account:%s )" % request.meta["accountText"].split("--")[0])
             reason = response_status_message(response.status)
             return self._retry(request, reason, spider) or response  # retry
         except Exception:
             # ``except Exception, e`` (py2-only syntax) narrowed; the
             # bound exception was never used.
             raise IgnoreRequest
     # Bug fix: non-redirect responses must be returned, not dropped.
     return response
Пример #45
0
    def process_response(self, request, response, spider):
        """Retry bad statuses and empty result pages (with a fresh proxy).

        Standard retry-code handling first; then a content check: if the
        expected ``list_2_tab`` table has no rows the page is considered
        bad (likely a blocked/broken proxy), a new proxy is installed on
        the request, and it is retried.

        :returns: the retried request, or the original response.
        """
        if request.meta.get('dont_retry', False):
            return response

        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        # Content-level check: an OK status with no table rows still
        # means the page is unusable.
        if not response.xpath('//table[@class="list_2_tab"]/tbody/tr'):
            proxy = self.get_proxy()
            logging.info('>>>>>>>> 替换代理重试')
            request.meta['proxy'] = proxy
            # Bug fix: the original passed the whole response body as the
            # retry *reason*, which pollutes retry stats/log keys with
            # arbitrary HTML.  Use a short, stable reason string instead.
            return self._retry(request, 'empty list_2_tab table', spider) or response

        return response
Пример #46
0
 def process_response(self, request, response, spider):
     """Fall back to an alternate image URL when the primary one fails.

     Only the first original_img_url request carries ``fallback_url`` in
     its meta.  The fallback is used when the status code is in
     ``self.fallback_http_codes``, or when the body cannot be opened as
     an image; otherwise the response is returned unchanged.
     """
     fallback_url = request.meta.get('fallback_url')
     if fallback_url is None:
         # No fallback configured for this request — nothing to do.
         return response

     if response.status in self.fallback_http_codes:
         # Status code mandates the fallback URL.
         reason = response_status_message(response.status)
         log.msg(format="Trying fallback for %(request)s (fallbackurl is %(fallback_url)s): %(reason)s",
                 level=log.DEBUG, spider=spider, request=request, fallback_url=fallback_url, reason=reason)
         return Request(fallback_url)

     # Validate the payload: if PIL cannot open it, the image is broken
     # and the fallback URL is used instead.
     try:
         im = Image.open(StringIO(response.body))
     except IOError as err:
         log.msg(format="Trying fallback for %(request)s (fallbackurl is %(fallback_url)s) because image could not be opened: %(reason)s",
             level=log.DEBUG, spider=spider, request=request, fallback_url=fallback_url, reason=err)
         return Request(fallback_url)

     return response
 def process_response(self, request, response, spider):
     """Retry any response whose status is not 200.

     :returns: the retry request when retries remain; otherwise the
         original response.  (The original returned ``self._retry(...)``
         bare — once max retries are exhausted ``_retry`` returns
         ``None``, which silently drops the response in Scrapy; the
         ``or response`` fallback fixes that.)
     """
     if response.status != 200:
         reason = response_status_message(response.status)
         return self._retry(request, reason, spider) or response
     return response
Пример #48
0
 def test_response_status_message(self):
     """response_status_message() joins the code with its reason phrase."""
     expected = {
         200: '200 OK',
         404: '404 Not Found',
         573: '573 Unknown Status',  # unrecognised codes get a placeholder
     }
     for code, message in expected.items():
         self.assertEqual(response_status_message(code), message)
Пример #49
0
    def process_response(self, request, response, spider):
        """Report forbidden responses, then pass every response through.

        Responses whose status is in ``self.forbidden_http_codes`` are
        handed to ``self._forbidden`` for side-effect handling; the
        response itself is always returned unchanged.
        """
        if response.status not in self.forbidden_http_codes:
            return response
        self._forbidden(request, response_status_message(response.status), spider)
        return response