Пример #1
0
 def test_get_meta_refresh(self):
     """Check get_meta_refresh on a plain, a <noscript> and a scripted page.

     Only the plain page is expected to yield a (delay, url) pair; the
     other two pages are expected to yield (None, None).
     """
     # Ordinary meta refresh tag inside <head>.
     plain_page = HtmlResponse("http://www.example.com", body="""
     <html>
     <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
     <body>blahablsdfsal&amp;</body>
     </html>""")
     # Meta refresh wrapped in a (malformed, mixed-case) <noscript> element.
     noscript_page = HtmlResponse("http://www.example.com", body="""
     <html>
     <head><title>Dummy</title><noScript>
     <meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
     </noSCRIPT>
     <body>blahablsdfsal&amp;</body>
     </html>""")
     # Meta refresh appearing only in <noscript> and inside a script string.
     scripted_page = HtmlResponse("http://www.example.com", body="""
 <noscript><meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage</noscript>
 <script type="text/javascript">
 if(!checkCookies()){
     document.write('<meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage">');
 }
 </script>
     """)
     expectations = [
         (plain_page, (5.0, 'http://example.org/newpage')),
         (noscript_page, (None, None)),
         (scripted_page, (None, None)),
     ]
     for page, expected in expectations:
         self.assertEqual(get_meta_refresh(page), expected)
Пример #2
0
 def test_get_meta_refresh(self):
     """Verify get_meta_refresh against three byte-bodied HTML responses.

     A visible meta refresh must be reported as (5.0, target url); a
     refresh hidden in <noscript> or in a script string must give
     (None, None).
     """
     site = "http://www.example.com"

     # Case 1: regular meta refresh in <head>.
     visible_refresh = HtmlResponse(site, body=b"""
     <html>
     <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
     <body>blahablsdfsal&amp;</body>
     </html>""")

     # Case 2: meta refresh inside a mixed-case <noscript> element.
     inside_noscript = HtmlResponse(site, body=b"""
     <html>
     <head><title>Dummy</title><noScript>
     <meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
     </noSCRIPT>
     <body>blahablsdfsal&amp;</body>
     </html>""")

     # Case 3: meta refresh only in <noscript> and in a document.write call.
     inside_script = HtmlResponse(site, body=b"""
 <noscript><meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage</noscript>
 <script type="text/javascript">
 if(!checkCookies()){
     document.write('<meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage">');
 }
 </script>
     """)

     nothing_found = (None, None)
     self.assertEqual(get_meta_refresh(visible_refresh),
                      (5.0, 'http://example.org/newpage'))
     self.assertEqual(get_meta_refresh(inside_noscript), nothing_found)
     self.assertEqual(get_meta_refresh(inside_script), nothing_found)
Пример #3
0
    def process_response(self, request, response, spider):
        """Retry the request when the response is a redirect or a captcha page.

        Three conditions trigger ``self._retry``: a 301/307 status, a meta
        refresh tag with a target URL, or an input whose id contains
        ``captchacharacters``. If ``_retry`` returns a falsy value, or no
        condition matches, the response is returned unchanged.
        """
        page_url = response.url

        def retry_or_pass(why):
            # Fall back to the original response when _retry gives nothing back.
            return self._retry(request, why, spider) or response

        if response.status in (301, 307):
            log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
            return retry_or_pass('redirect %d' % response.status)

        # Only the refresh target matters here; the delay is ignored.
        _delay, refresh_target = get_meta_refresh(response)
        if refresh_target:
            log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
            return retry_or_pass('meta')

        # Look for the captcha form field.
        captcha_inputs = HtmlXPathSelector(response).select(
            ".//input[contains(@id, 'captchacharacters')]").extract()
        if captcha_inputs:
            log.msg("captcha page %s" % page_url, level=log.INFO)
            return retry_or_pass('capcha')

        return response
Пример #4
0
    def process_response(self, request, response, spider):
        """Follow HTTP and meta-refresh redirects unless the request opts out.

        A request carrying ``dont_redirect`` in its meta is never redirected.
        HEAD requests and 301/307 responses keep the original request and
        only swap its URL; 302/303 responses and meta refreshes are rebuilt
        through ``self._redirect_request_using_get``. The actual redirect
        is delegated to ``self._redirect``; otherwise the response is
        returned unchanged.
        """
        if 'dont_redirect' in request.meta:
            return response

        if request.method.upper() == 'HEAD':
            if response.status not in [301, 302, 303, 307] or 'Location' not in response.headers:
                return response
            target_url = urljoin(request.url, response.headers['location'])
            return self._redirect(request.replace(url=target_url), request,
                                  spider, response.status)

        if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
            target_url = urljoin(request.url, response.headers['location'])
            if response.status in [302, 303]:
                follow_up = self._redirect_request_using_get(request, target_url)
            else:
                follow_up = request.replace(url=target_url)
            return self._redirect(follow_up, request, spider, response.status)

        if isinstance(response, HtmlResponse):
            delay, refresh_url = get_meta_refresh(response)
            # Only refreshes quicker than the configured maximum are followed.
            if refresh_url and delay < self.max_metarefresh_delay:
                follow_up = self._redirect_request_using_get(request, refresh_url)
                return self._redirect(follow_up, request, spider, 'meta refresh')

        return response
Пример #5
0
    def process_response(self, request, response, spider):
        """Turn redirecting responses into follow-up requests.

        Nothing is redirected when ``dont_redirect`` is present in the
        request meta. Redirect statuses with a ``Location`` header and HTML
        meta refreshes shorter than ``self.max_metarefresh_delay`` are
        passed to ``self._redirect``; any other response is returned as is.
        """
        if 'dont_redirect' in request.meta:
            return response

        def location_url():
            # Resolve the Location header against the URL that was requested.
            return urljoin(request.url, response.headers['location'])

        def follow(new_request, reason):
            return self._redirect(new_request, request, spider, reason)

        if request.method.upper() == 'HEAD':
            # HEAD requests keep their method for every redirect status.
            if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
                return follow(request.replace(url=location_url()), response.status)
            return response

        if response.status in [302, 303] and 'Location' in response.headers:
            rebuilt = self._redirect_request_using_get(request, location_url())
            return follow(rebuilt, response.status)

        if response.status in [301, 307] and 'Location' in response.headers:
            return follow(request.replace(url=location_url()), response.status)

        if isinstance(response, HtmlResponse):
            wait, refresh_url = get_meta_refresh(response)
            if refresh_url and wait < self.max_metarefresh_delay:
                rebuilt = self._redirect_request_using_get(request, refresh_url)
                return follow(rebuilt, 'meta refresh')

        return response
Пример #6
0
    def _check_redirect(self, request, response):
        if request.method.upper() == 'HEAD':
            if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
                redirected_url = urljoin_rfc(request.url, response.headers['location'])
                redirected = request.replace(url=redirected_url)
                return redirected,response.status
            else:
                return None,None

        if response.status in [302, 303] and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers['location'])
            redirected = self._redirect_request_using_get(request, redirected_url)
            return redirected,response.status

        if response.status in [301, 307] and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers['location'])
            redirected = request.replace(url=redirected_url)
            return redirected,response.status

        if  request.meta.get('meta_refresh',None) and  \
            isinstance(response, HtmlResponse):
            interval, url = get_meta_refresh(response)
            if url and interval < self.max_metarefresh_delay:
                redirected = self._redirect_request_using_get(request, url)
                return redirected,'meta refresh'

        return None,None
 def process_response(self, request, response, spider):
     """Record a redirect to the redirect log file and retry the request.

     Both HTTP redirects (301/302/303/307 with a ``Location`` header) and
     meta-refresh redirects are handled. Each one appends a JSON object
     followed by a comma to ``output/<redirect_filename>``, then the request
     is handed to ``self._retry``. If ``_retry`` returns a falsy value, or
     the response is not a redirect, the response is returned unchanged.
     """
     # NOTE(review): a fresh DmozSpider is built per response just to read
     # its redirect_filename; presumably `spider` carries the same value.
     obj = DmozSpider()
     redirect_filename = 'output/' + obj.redirect_filename
     from_url = request.url

     if response.status in [301, 302, 303, 307] and "Location" in response.headers:
         to_url = response.headers["Location"]
         get_list = self.extractgetList(from_url)
         reason = 'redirect %d' % response.status
     else:
         interval, redirect_url = get_meta_refresh(response)
         if not redirect_url:
             return response
         # The destination used to stay '' here, so the record and the log
         # line lost the meta-refresh target.
         to_url = redirect_url
         get_list = self.extractgetList(to_url)
         reason = 'meta'

     redirect_dict = {"from_url": from_url, "params": get_list, "to_url": to_url}
     # `with` closes the file even if serialisation fails.
     with open(redirect_filename, 'a') as f:
         json.dump(redirect_dict, f, indent=2)
         f.write(",")
     log.msg("trying to redirect : %s -> %s" % (from_url, to_url), level=log.INFO)
     return self._retry(request, reason, spider) or response
Пример #8
0
    def process_response(self, request, response, spider):
        """Follow a meta refresh for requests flagged with ``meta_refresh``.

        Requests without a truthy ``meta_refresh`` meta entry pass through
        untouched. For flagged requests, ``IgnoreRequest`` is raised when no
        refresh target is found or the target contains any entry of
        ``off_keys``. Targets starting with ``http`` are redirected via
        ``self._redirect``; any other target leaves the response as is.
        """
        if not request.meta.get("meta_refresh"):
            return response

        _, target = get_meta_refresh(response)
        # No target, or a target matching a blocked key: drop the page.
        if not target or any(blocked in target for blocked in off_keys):
            raise IgnoreRequest

        if not target.startswith("http"):
            return response

        # Clear the flag and body-related headers before re-issuing.
        request.meta["meta_refresh"] = False
        for header_name in ('Content-Type', "Referer", 'Content-Length'):
            request.headers.pop(header_name, None)
        request.priority += 100
        follow_up = request.replace(url=target)
        why = "local pan middlewares, meta redirected!!!"
        return self._redirect(follow_up, request, spider, why) or response
Пример #9
0
    def _check_redirect(self, request, response):
        if request.method.upper() == 'HEAD':
            if response.status in [301, 302, 303, 307
                                   ] and 'Location' in response.headers:
                redirected_url = urljoin_rfc(request.url,
                                             response.headers['location'])
                redirected = request.replace(url=redirected_url)
                return redirected, response.status
            else:
                return None, None

        if response.status in [302, 303] and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url,
                                         response.headers['location'])
            redirected = self._redirect_request_using_get(
                request, redirected_url)
            return redirected, response.status

        if response.status in [301, 307] and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url,
                                         response.headers['location'])
            redirected = request.replace(url=redirected_url)
            return redirected, response.status

        if  request.meta.get('meta_refresh',None) and  \
            isinstance(response, HtmlResponse):
            interval, url = get_meta_refresh(response)
            if url and interval < self.max_metarefresh_delay:
                redirected = self._redirect_request_using_get(request, url)
                return redirected, 'meta refresh'

        return None, None
Пример #10
0
	def process_response(self, request, response, spider):
		"""Retry redirect/captcha responses and note each one in redirects.txt.

		A 301/307 status, a meta refresh with a target URL, or a captcha
		input field each append two lines to ``redirects.txt`` and hand the
		request to ``self._retry``. If ``_retry`` returns a falsy value, or
		nothing matches, the response is returned unchanged.
		"""
		url = response.url
		with open('redirects.txt', 'a+') as f:
			if response.status in [301, 307]:
				reason = 'redirect %d' % response.status
				f.write("trying to redirect us: " + url + '\n')
				# Write the formatted reason; the old code concatenated the
				# status onto an unformatted 'redirect %d' literal.
				f.write(reason + '\n')
				return self._retry(request, reason, spider) or response
			interval, redirect_url = get_meta_refresh(response)
			# handle meta redirect
			if redirect_url:
				f.write("trying to redirect us: " + url + '\n')
				f.write('redirect meta' + '\n')
				reason = 'meta'
				return self._retry(request, reason, spider) or response
			hxs = HtmlXPathSelector(response)
			# test for captcha page
			captcha = hxs.select(".//input[contains(@id, 'captchacharacters')]").extract()
			if captcha:
				f.write("trying to redirect us: " + url + '\n')
				f.write('redirect capcha' + '\n')
				reason = 'capcha'
				return self._retry(request, reason, spider) or response
		return response
Пример #11
0
 def process_response(self, request, response, spider):
     """Retry responses that look like redirects, blocks or captcha pages.

     The checks run in order: 301/307 status, 502/503 status, meta refresh
     target, captcha field or the "Dostęp zablokowany" marker, and finally a
     502/503/504 status. The first match logs a message and hands the
     request to ``self._retry``; the response is returned when ``_retry``
     yields a falsy value or nothing matches.
     """
     page_url = response.url
     status = response.status

     def retry(reason):
         return self._retry(request, reason, spider) or response

     if status in (301, 307):
         log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
         return retry('redirect %d' % status)

     # The refresh target is parsed before the 5xx check and used after it.
     _interval, refresh_target = get_meta_refresh(response)

     if status in (502, 503):
         log.msg("Service Unavailable: %s" % page_url, level=log.INFO)
         return retry('Possible block with 5xx error')

     if refresh_target:
         log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
         return retry('meta')

     captcha_fields = HtmlXPathSelector(response).select(
         ".//input[contains(@id, 'captchacharacters')]").extract()
     if captcha_fields or "Dostęp zablokowany" in response.text:
         log.msg("captcha page %s" % page_url, level=log.INFO)
         return retry('captcha')

     # 502 and 503 already returned above, so only 504 can reach this branch.
     if status in (502, 503, 504):
         log.msg("Captcha with status: \n%s" % response.text, level=log.INFO)
         return retry('captcha')

     return response
Пример #12
0
    def process_response(self, request, response, spider):
        """Follow an HTML meta refresh when its delay is below ``self._maxdelay``.

        The response is returned untouched for requests with a truthy
        ``dont_redirect`` meta value, for HEAD requests, for non-HTML
        responses, and when no quick-enough refresh target is found.
        """
        opted_out = request.meta.get('dont_redirect', False)
        if opted_out or request.method == 'HEAD' or not isinstance(response, HtmlResponse):
            return response

        delay, target = get_meta_refresh(response)
        if not (target and delay < self._maxdelay):
            return response

        follow_up = self._redirect_request_using_get(request, target)
        return self._redirect(follow_up, request, spider, 'meta refresh')
Пример #13
0
    def process_response(self, request, response, spider):
        """Redirect to a meta-refresh target, or pass the response through.

        Skipped entirely when the request meta has a truthy
        ``dont_redirect``, the method is HEAD, or the response is not an
        ``HtmlResponse``. A refresh is followed only if its interval is
        smaller than ``self._maxdelay``.
        """
        skip_redirect = (
            request.meta.get('dont_redirect', False)
            or request.method == 'HEAD'
            or not isinstance(response, HtmlResponse)
        )
        if skip_redirect:
            return response

        wait_seconds, refresh_url = get_meta_refresh(response)
        should_follow = refresh_url and wait_seconds < self._maxdelay
        if should_follow:
            new_request = self._redirect_request_using_get(request, refresh_url)
            return self._redirect(new_request, request, spider, 'meta refresh')

        return response
Пример #14
0
    def process_response(self, request, response, spider):
        """Follow an HTML meta refresh whose delay is below ``self._maxdelay``.

        The response is returned unchanged when ``dont_redirect`` is present
        in the request meta, the request is a HEAD request, the response is
        not an ``HtmlResponse``, or no quick-enough refresh target exists.
        """
        if "dont_redirect" in request.meta or request.method == "HEAD" or not isinstance(response, HtmlResponse):
            return response

        # The guard above already returned for non-HTML responses, so a
        # second isinstance check here would be dead code.
        interval, url = get_meta_refresh(response)
        if url and interval < self._maxdelay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, "meta refresh")

        return response
Пример #15
0
 def process_response(self, request, response, spider):
     """Retry the request when the response is an HTTP or meta redirect.

     A 301/307 status or a meta refresh with a target URL hands the request
     to ``self._retry``. The response is returned when ``_retry`` yields a
     falsy value or when the response is not a redirect.
     """
     url = response.url
     if response.status in [301, 307]:
         log.msg("trying to redirect us: %s" % url, level=log.INFO)
         reason = 'redirect %d' % response.status
         return self._retry(request, reason, spider) or response
     interval, redirect_url = get_meta_refresh(response)
     # handle meta redirect
     if redirect_url:
         log.msg("trying to redirect us: %s" % url, level=log.INFO)
         reason = 'meta'
         return self._retry(request, reason, spider) or response
     # A downloader middleware must hand back a response or request; falling
     # off the end here used to return None for every ordinary page.
     return response
Пример #16
0
 def process_response(self, request, response, spider):
     """Send redirecting responses back through ``self._retry``.

     Redirects are detected by a 301/307 status or by a meta refresh tag
     with a target URL. Whatever ``_retry`` returns is passed on; if that is
     falsy, or the response is not a redirect, the response itself is
     returned.
     """
     url = response.url

     if response.status in [301, 307]:
         reason = 'redirect %d' % response.status
     else:
         interval, redirect_url = get_meta_refresh(response)
         if not redirect_url:
             # Previously this path fell off the end and returned None,
             # which is not a valid process_response result.
             return response
         reason = 'meta'

     log.msg("trying to redirect us: %s" % url, level=log.INFO)
     return self._retry(request, reason, spider) or response
Пример #17
0
    def process_response(self, request, response, spider):
        """Follow a meta refresh found outside the ignored tags.

        Requests with a truthy ``dont_redirect`` meta value, HEAD requests
        and non-HTML responses are passed through. The refresh is looked up
        with ``ignore_tags=self._ignore_tags`` and followed only when its
        delay is below ``self._maxdelay``.
        """
        if request.meta.get("dont_redirect", False):
            return response
        if request.method == "HEAD":
            return response
        if not isinstance(response, HtmlResponse):
            return response

        delay, target = get_meta_refresh(response, ignore_tags=self._ignore_tags)
        if target and delay < self._maxdelay:
            follow_up = self._redirect_request_using_get(request, target)
            return self._redirect(follow_up, request, spider, "meta refresh")

        return response
    def process_response(self, request, response, spider):
        """Follow an HTML meta refresh, printing "view" pages on the way.

        The response is returned unchanged when ``dont_redirect`` is in the
        request meta, the request is a HEAD request, or the response is not
        an ``HtmlResponse``. When a refresh target with a delay below
        ``self._maxdelay`` is found, ``self.print_conversation`` is called
        for URLs containing "view" before redirecting.
        """
        if 'dont_redirect' in request.meta or request.method == 'HEAD' or \
                not isinstance(response, HtmlResponse):
            return response

        # Non-HTML responses returned above, so no second isinstance check
        # is needed here.
        interval, url = get_meta_refresh(response)
        if url and interval < self._maxdelay:
            redirected = self._redirect_request_using_get(request, url)
            # find() > 0 deliberately ignores a match at index 0.
            if response.url.find("view") > 0:
                self.print_conversation(response)
            return self._redirect(redirected, request, spider, 'meta refresh')

        return response
Пример #19
0
    def process_response(self, request, response, spider):
        """Retry on configured HTTP codes and on 403 responses.

        Requests with a truthy ``dont_retry`` meta value are never retried.
        A status listed in ``self.retry_http_codes``, or a 403 with a
        non-empty URL, is handed to ``self._retry``. A meta refresh target
        is only printed, not followed.
        """
        if request.meta.get('dont_retry', False):
            return response

        status = response.status
        if status in self.retry_http_codes:
            return self._retry(request, response_status_message(status), spider) or response

        _interval, refresh_target = get_meta_refresh(response)
        if refresh_target:
            print('redirect_url',refresh_target,'*******','response.url',response.url)

        # Custom check: 403 responses are retried as well.
        if status == 403 and response.url:
            print('*********************response.url',response.url)
            retry_reason = 'response got xpath "{}"'.format(response.url)
            return self._retry(request, retry_reason, spider) or response

        return response
    def process_response(self, request, response, spider):
        """Follow at most one HTML meta refresh per request chain.

        ``dont_filter`` is always set in the request meta. The response is
        returned, and ``dont_redirect`` set in the meta, when the request
        already opts out, is a HEAD request, has been redirected at least
        once, or the response is not an ``HtmlResponse``. Otherwise a
        refresh with a delay below ``self._maxdelay`` is followed through
        ``self._redirect`` with ``dont_filter`` enabled.
        """
        request.meta['dont_filter'] = True

        # A missing 'redirect_times' counts as zero; comparing None >= 1
        # raises TypeError on Python 3.
        redirect_times = request.meta.get('redirect_times') or 0
        # Cheap, side-effect-free checks first; the isinstance check is last.
        if 'dont_redirect' in request.meta or request.method == 'HEAD' or \
                redirect_times >= 1 or not isinstance(response, HtmlResponse):
            request.meta['dont_redirect'] = True
            return response

        interval, url = get_meta_refresh(response)
        if url and interval < self._maxdelay:
            redirected = self._redirect_request_using_get(request, url)
            redirected.dont_filter = True
            return self._redirect(redirected, request, spider,
                                  'meta refresh')

        return response
Пример #21
0
    def process_response(self, request, response, spider):
        """Redirect on 301/302/303/307 with a Location header or on meta refresh.

        302/303 responses and meta refreshes are rebuilt through
        ``self._redirect_request_using_get``; 301/307 keep the request and
        only replace its URL. Meta refreshes are followed when their delay
        is below ``self.max_metarefresh_delay``. Anything else is returned
        unchanged.
        """
        status = response.status

        if status in [301, 302, 303, 307] and 'Location' in response.headers:
            target_url = urljoin_rfc(request.url, response.headers['location'])
            if status in [302, 303]:
                follow_up = self._redirect_request_using_get(request, target_url)
            else:
                follow_up = request.replace(url=target_url)
            return self._redirect(follow_up, request, spider, status)

        delay, refresh_url = get_meta_refresh(response)
        if not (refresh_url and delay < self.max_metarefresh_delay):
            return response

        follow_up = self._redirect_request_using_get(request, refresh_url)
        return self._redirect(follow_up, request, spider, 'meta refresh')
 def process_response(self, request, response, spider):
     """Append each redirect to the redirect log file, then retry.

     HTTP redirects (301/302/303/307 with a ``Location`` header) and
     meta-refresh redirects are written as a JSON object followed by a
     comma to ``output/<redirect_filename>``, logged, and handed to
     ``self._retry``. The response is returned when ``_retry`` yields a
     falsy value or when the response does not redirect.
     """
     # NOTE(review): instantiates DmozSpider only to read redirect_filename;
     # confirm whether `spider` could be used instead.
     redirect_filename = 'output/' + DmozSpider().redirect_filename
     from_url = request.url

     def record_and_retry(params, to_url, reason):
         # Persist the redirect, log it, then let _retry decide.
         redirect_dict = {
             "from_url": from_url,
             "params": params,
             "to_url": to_url
         }
         # `with` guarantees the handle is closed even if json.dump raises.
         with open(redirect_filename, 'a') as f:
             json.dump(redirect_dict, f, indent=2)
             f.write(",")
         log.msg("trying to redirect : %s -> %s" % (from_url, to_url),
                 level=log.INFO)
         return self._retry(request, reason, spider) or response

     if response.status in [301, 302, 303, 307
                            ] and "Location" in response.headers:
         return record_and_retry(self.extractgetList(from_url),
                                 response.headers["Location"],
                                 'redirect %d' % response.status)

     interval, redirect_url = get_meta_refresh(response)
     if redirect_url:
         # The meta-refresh target is the destination; it used to be
         # recorded and logged as an empty string.
         return record_and_retry(self.extractgetList(redirect_url),
                                 redirect_url, 'meta')
     return response
Пример #23
0
 def process_response(self, request, response, spider):
     """Retry responses that are redirects or point at a "userblocked" URL.

     A 301/302/307 status, a meta refresh with a target URL, or a response
     URL containing "userblocked" hands the request to ``self._retry``.
     The response is returned when ``_retry`` yields a falsy value or
     nothing matches.
     """
     page_url = response.url

     def retry(reason):
         return self._retry(request, reason, spider) or response

     if response.status in (301, 302, 307):
         log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
         return retry('redirect %d' % response.status)

     _interval, refresh_target = get_meta_refresh(response)
     if refresh_target:
         log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
         return retry('meta')

     # Debug output: the URL and whether it looks like a block page.
     print(response.url)
     is_blocked = "userblocked" in response.url
     print(is_blocked)
     if is_blocked:
         log.msg("blocked page %s" % page_url, level=log.INFO)
         return retry('blocked')

     return response
 def process_response(self, request, response, spider):
     """Retry responses that are redirects or captcha pages.

     After emitting debug output, a 301/302 status, a meta refresh with a
     target URL, or an input whose id contains ``captchacharacters`` hands
     the request to ``self._retry``. The response is returned when
     ``_retry`` yields a falsy value or nothing matches.
     """
     page_url = response.url
     status = response.status

     logging.debug('WangJun I am OK111')
     logging.debug('WangJun I am OK111 %d' % status)
     print('Cookie', response)

     def retry(reason):
         return self._retry(request, reason, spider) or response

     if status in (301, 302):
         logging.info("WangJun trying to redirect us: %s" % page_url)
         return retry('redirect %d' % status)

     _interval, refresh_target = get_meta_refresh(response)
     if refresh_target:
         logging.info("trying to redirect us: %s" % page_url)
         return retry('meta')

     # Look for the captcha form field.
     captcha_fields = Selector(response).xpath(
         ".//input[contains(@id, 'captchacharacters')]").extract()
     if captcha_fields:
         logging.info("captcha page %s" % page_url)
         return retry('capcha')

     return response
Пример #25
0
    def process_response(self, request, response, spider):
        """Retry requests that were answered with a redirect.

        Responses with status 200 are returned immediately. A 301/307
        status, or a meta refresh with a target URL, hands the request to
        ``self._retry``. The response is returned when ``_retry`` yields a
        falsy value, when no redirect is detected, or when the meta-refresh
        lookup fails.
        """
        # Skip pages that were fetched successfully.
        if response.status == 200:
            return response

        if response.status in [301, 307]:
            # .get avoids a KeyError when the redirect has no Location header.
            location = response.headers.get('Location')
            logger.info("trying to redirect us: %s", location)
            reason = 'redirect %d' % response.status
            return self._retry(request, reason, spider) or response

        # get_meta_refresh fails with "'Response' object has no attribute
        # 'text'" on some responses, so the lookup is wrapped in a try.
        try:
            interval, redirect_url = get_meta_refresh(response)
        except Exception:
            # Log with traceback, then pass the response on; the old handler
            # printed with Python 2 statements and returned None.
            logger.exception("get_meta_refresh failed for %s", response.url)
            return response

        # handle meta redirect
        if redirect_url:
            logger.info("trying to redirect us: %s", response.url)
            reason = 'meta'
            return self._retry(request, reason, spider) or response

        # Hand the response on instead of falling off the end with None.
        return response
Пример #26
0
    def test_get_meta_refresh(self):
        """Exercise get_meta_refresh against many kinds of meta refresh markup."""

        def parsed(**response_kwargs):
            # Build the TextResponse exactly as requested and parse it.
            return get_meta_refresh(TextResponse(**response_kwargs))

        # meta refresh inside a complete document
        html = """
            <html>
            <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(parsed(url='http://example.org', body=html),
                         (5, 'http://example.org/newpage'))

        # a refresh without a url yields (None, None)
        html = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(parsed(url='http://example.org', body=html),
                         (None, None))

        # content attribute split across two lines
        html = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        self.assertEqual(parsed(url='http://example.org', body=html),
                         (5, 'http://example.org/newpage'))

        # upper-case tag spread over several lines
        html = """<html><head>
               <META
               HTTP-EQUIV="Refresh"
               CONTENT="1; URL=http://example.org/newpage">"""
        self.assertEqual(parsed(url='http://example.org', body=html),
                         (1, 'http://example.org/newpage'))

        # entities around the redirect url
        html = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
        self.assertEqual(parsed(url='http://example.com', body=html),
                         (3, 'http://www.example.com/other'))

        # relative redirect targets are resolved against the response url
        html = """<meta http-equiv="refresh" content="3; url=other.html">"""
        self.assertEqual(
            parsed(url='http://example.com/page/this.html', body=html),
            (3, 'http://example.com/page/other.html'))

        # non-standard encoding (utf-16)
        html = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
        html = html.decode('ascii').encode('utf-16')
        self.assertEqual(
            parsed(url='http://example.com', body=html, encoding='utf-16'),
            (3, 'http://example.com/redirect'))

        # non-ascii characters in the url (utf8)
        html = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        self.assertEqual(
            parsed(url='http://example.com', body=html, encoding='utf-8'),
            (3, 'http://example.com/to%C2%A3'))

        # non-ascii characters in the url (latin1)
        html = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        self.assertEqual(
            parsed(url='http://example.com', body=html, encoding='latin1'),
            (3, 'http://example.com/to%C2%A3'))

        # responses without a refresh tag yield (None, None); checked twice
        self.assertEqual(parsed(url='http://example.org'), (None, None))
        self.assertEqual(parsed(url='http://example.org'), (None, None))

        # a meta refresh inside an html comment must not be followed
        html = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(parsed(url='http://example.com', body=html),
                         (None, None))

        # html comments must not hide an uncommented meta refresh
        html = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(parsed(url='http://example.com', body=html),
                         (3, 'http://example.com/'))

        # fractional refresh intervals
        html = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(parsed(url='http://example.com', body=html),
                         (0.1, 'http://example.com/index.html'))

        html = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(parsed(url='http://example.com', body=html),
                         (3.1, 'http://example.com/index.html'))
Пример #27
0
    def test_get_meta_refresh(self):
        """Check ``get_meta_refresh`` against assorted meta-refresh markup.

        Each case wraps a raw body in a ``TextResponse`` and asserts the
        ``(interval, url)`` tuple returned for it.  Bodies are written as
        bytes literals so that the raw byte escapes and the
        ``decode``/``encode`` round-trip below mean the same thing on
        Python 2 and Python 3.
        """
        body = b"""
            <html>
            <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        response = TextResponse(url='http://example.org', body=body)
        self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage'))

        # refresh without url should return (None, None)
        body = b"""<meta http-equiv="refresh" content="5" />"""
        response = TextResponse(url='http://example.org', body=body)
        self.assertEqual(get_meta_refresh(response), (None, None))

        # line break between the interval and the url inside the content attribute
        body = b"""<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        response = TextResponse(url='http://example.org', body=body)
        self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage'))

        # meta refresh in multiple lines
        body = b"""<html><head>
               <META
               HTTP-EQUIV="Refresh"
               CONTENT="1; URL=http://example.org/newpage">"""
        response = TextResponse(url='http://example.org', body=body)
        self.assertEqual(get_meta_refresh(response), (1, 'http://example.org/newpage'))

        # entities in the redirect url
        body = b"""<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
        response = TextResponse(url='http://example.com', body=body)
        self.assertEqual(get_meta_refresh(response), (3, 'http://www.example.com/other'))

        # relative redirects are resolved against the response url
        body = b"""<meta http-equiv="refresh" content="3; url=other.html">"""
        response = TextResponse(url='http://example.com/page/this.html', body=body)
        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/page/other.html'))

        # non-standard encodings (utf-16): re-encode the ascii markup so the
        # response body really is utf-16 bytes
        body = b"""<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
        body = body.decode('ascii').encode('utf-16')
        response = TextResponse(url='http://example.com', body=body, encoding='utf-16')
        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/redirect'))

        # non-ascii chars in the url (default encoding - utf8)
        body = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        response = TextResponse(url='http://example.com', body=body)
        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3'))

        # non-ascii chars in the url (custom encoding - latin1); the expected
        # url is the same percent-encoded form as in the utf8 case above
        body = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        response = TextResponse(url='http://example.com', body=body, encoding='latin1')
        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3'))

        # responses without refresh tag should return None None
        response = TextResponse(url='http://example.org')
        self.assertEqual(get_meta_refresh(response), (None, None))