def test_get_meta_refresh(self):
    """Check ``get_meta_refresh`` against plain, ``<noscript>`` and scripted tags.

    * ``r1`` carries an ordinary refresh tag in ``<head>`` and must yield
      the interval and target URL.
    * ``r2`` wraps the tag in ``<noscript>`` and must yield ``(None, None)``.
    * ``r3`` only has the tag inside ``<noscript>`` and inside a
      ``document.write`` call in a ``<script>``; it must yield
      ``(None, None)`` as well.

    Bodies are bytes literals: the responses are built without an explicit
    encoding, so a text body cannot be converted on Python 3.
    """
    r1 = HtmlResponse("http://www.example.com", body=(
        b' <html>'
        b' <head><title>Dummy</title>'
        b'<meta http-equiv="refresh" content="5;url=http://example.org/newpage" />'
        b'</head>'
        b' <body>blahablsdfsal&</body>'
        b' </html>'
    ))
    r2 = HtmlResponse("http://www.example.com", body=(
        b' <html>'
        b' <head><title>Dummy</title><noScript>'
        b' <meta http-equiv="refresh" content="5;url=http://example.org/newpage" />'
        b'</head>'
        b' </noSCRIPT>'
        b' <body>blahablsdfsal&</body>'
        b' </html>'
    ))
    # The unterminated ``content`` attribute in the <noscript> part is
    # deliberate test data.
    r3 = HtmlResponse("http://www.example.com", body=(
        b' <noscript><meta http-equiv="REFRESH"'
        b' content="0;url=http://www.example.com/newpage</noscript>'
        b' <script type="text/javascript">'
        b' if(!checkCookies()){'
        b' document.write(\'<meta http-equiv="REFRESH"'
        b' content="0;url=http://www.example.com/newpage">\');'
        b' }'
        b' </script> '
    ))

    self.assertEqual(
        get_meta_refresh(r1), (5.0, 'http://example.org/newpage'))
    self.assertEqual(get_meta_refresh(r2), (None, None))
    self.assertEqual(get_meta_refresh(r3), (None, None))
def test_get_meta_refresh(self):
    """Table-driven check of ``get_meta_refresh`` on three HTML bodies.

    Each entry pairs a raw body with the ``(interval, url)`` tuple that
    ``get_meta_refresh`` is expected to return for it.
    """
    plain_refresh = (
        b' <html>'
        b' <head><title>Dummy</title>'
        b'<meta http-equiv="refresh" content="5;url=http://example.org/newpage" />'
        b'</head>'
        b' <body>blahablsdfsal&</body>'
        b' </html>'
    )
    noscript_refresh = (
        b' <html>'
        b' <head><title>Dummy</title><noScript>'
        b' <meta http-equiv="refresh" content="5;url=http://example.org/newpage" />'
        b'</head>'
        b' </noSCRIPT>'
        b' <body>blahablsdfsal&</body>'
        b' </html>'
    )
    scripted_refresh = (
        b' <noscript><meta http-equiv="REFRESH"'
        b' content="0;url=http://www.example.com/newpage</noscript>'
        b' <script type="text/javascript">'
        b' if(!checkCookies()){'
        b' document.write(\'<meta http-equiv="REFRESH"'
        b' content="0;url=http://www.example.com/newpage">\');'
        b' }'
        b' </script> '
    )

    expectations = [
        (plain_refresh, (5.0, 'http://example.org/newpage')),
        (noscript_refresh, (None, None)),
        (scripted_refresh, (None, None)),
    ]

    # Build every response up front, then run the assertions in order.
    responses = [
        HtmlResponse("http://www.example.com", body=raw)
        for raw, _ in expectations
    ]
    for page, (_, wanted) in zip(responses, expectations):
        self.assertEqual(get_meta_refresh(page), wanted)
def process_response(self, request, response, spider):
    """Retry responses that look like a redirect, meta refresh or captcha.

    Each check, when it matches, logs at INFO level and hands the request
    to ``self._retry``; if ``_retry`` returns a falsy value the original
    response is returned instead. A response matching no check is
    returned unchanged.
    """
    page_url = response.url

    # HTTP-level redirect.
    if response.status in (301, 307):
        log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
        retry_reason = 'redirect %d' % response.status
        return self._retry(request, retry_reason, spider) or response

    # <meta http-equiv="refresh"> redirect.
    _, refresh_target = get_meta_refresh(response)
    if refresh_target:
        log.msg("trying to redirect us: %s" % page_url, level=log.INFO)
        return self._retry(request, 'meta', spider) or response

    # Captcha page: recognised by an input whose id contains
    # 'captchacharacters'.
    selector = HtmlXPathSelector(response)
    captcha_inputs = selector.select(
        ".//input[contains(@id, 'captchacharacters')]").extract()
    if not captcha_inputs:
        return response

    log.msg("captcha page %s" % page_url, level=log.INFO)
    return self._retry(request, 'capcha', spider) or response
def process_response(self, request, response, spider):
    """Follow HTTP and meta-refresh redirects unless the request opts out.

    * Requests with ``dont_redirect`` in their meta are never redirected.
    * HEAD requests follow 301/302/303/307 with the request otherwise
      unchanged; any other HEAD response is returned as is.
    * 302/303 are followed via ``self._redirect_request_using_get``.
    * 301/307 are followed with the request otherwise unchanged.
    * HTML responses with a meta refresh shorter than
      ``self.max_metarefresh_delay`` are followed via
      ``self._redirect_request_using_get``.
    """
    if 'dont_redirect' in request.meta:
        return response

    is_head = request.method.upper() == 'HEAD'
    status = response.status
    has_location = 'Location' in response.headers

    if is_head:
        if not (status in [301, 302, 303, 307] and has_location):
            return response
        target = urljoin(request.url, response.headers['location'])
        return self._redirect(
            request.replace(url=target), request, spider, status)

    if status in [302, 303] and has_location:
        target = urljoin(request.url, response.headers['location'])
        follow_up = self._redirect_request_using_get(request, target)
        return self._redirect(follow_up, request, spider, status)

    if status in [301, 307] and has_location:
        target = urljoin(request.url, response.headers['location'])
        return self._redirect(
            request.replace(url=target), request, spider, status)

    if isinstance(response, HtmlResponse):
        delay, refresh_url = get_meta_refresh(response)
        if refresh_url and delay < self.max_metarefresh_delay:
            follow_up = self._redirect_request_using_get(request, refresh_url)
            return self._redirect(follow_up, request, spider, 'meta refresh')

    return response
def process_response(self, request, response, spider):
    """Turn redirect responses into follow-up requests.

    Returns the result of ``self._redirect`` when the response is an HTTP
    redirect carrying a ``Location`` header, or an HTML page whose meta
    refresh delay is below ``self.max_metarefresh_delay``. Otherwise the
    response is returned unchanged. Requests with ``dont_redirect`` in
    their meta are left alone.
    """
    if 'dont_redirect' in request.meta:
        return response

    def location():
        # Resolve the Location header against the requested URL.
        return urljoin(request.url, response.headers['location'])

    def keep_method():
        # Same request, new URL.
        return request.replace(url=location())

    code = response.status
    located = 'Location' in response.headers

    if request.method.upper() == 'HEAD':
        # HEAD requests are only followed for header-based redirects.
        if code in [301, 302, 303, 307] and located:
            return self._redirect(keep_method(), request, spider, code)
        return response

    if code in [302, 303] and located:
        as_get = self._redirect_request_using_get(request, location())
        return self._redirect(as_get, request, spider, code)

    if code in [301, 307] and located:
        return self._redirect(keep_method(), request, spider, code)

    if not isinstance(response, HtmlResponse):
        return response

    interval, url = get_meta_refresh(response)
    if not url or not (interval < self.max_metarefresh_delay):
        return response

    as_get = self._redirect_request_using_get(request, url)
    return self._redirect(as_get, request, spider, 'meta refresh')
def _check_redirect(self, request, response):
    """Work out whether ``response`` redirects ``request`` somewhere else.

    Returns a ``(redirected_request, reason)`` pair, where ``reason`` is
    the HTTP status code or the string ``'meta refresh'``. Returns
    ``(None, None)`` when no redirect applies. Meta refresh is only
    considered when the request's ``meta_refresh`` meta value is truthy
    and the response is an ``HtmlResponse``.
    """
    no_redirect = (None, None)
    status = response.status
    has_location = 'Location' in response.headers

    def same_method_redirect():
        # Same request, pointed at the resolved Location header.
        target = urljoin_rfc(request.url, response.headers['location'])
        return request.replace(url=target)

    if request.method.upper() == 'HEAD':
        if status in [301, 302, 303, 307] and has_location:
            return same_method_redirect(), status
        return no_redirect

    if status in [302, 303] and has_location:
        target = urljoin_rfc(request.url, response.headers['location'])
        return self._redirect_request_using_get(request, target), status

    if status in [301, 307] and has_location:
        return same_method_redirect(), status

    wants_meta = request.meta.get('meta_refresh', None)
    if wants_meta and isinstance(response, HtmlResponse):
        delay, refresh_url = get_meta_refresh(response)
        if refresh_url and delay < self.max_metarefresh_delay:
            follow_up = self._redirect_request_using_get(request, refresh_url)
            return follow_up, 'meta refresh'

    return no_redirect
def process_response(self, request, response, spider):
    """Record redirects to a file and retry the request.

    A redirect is either a 301/302/303/307 response with a ``Location``
    header or a page with a meta refresh tag. For each one, an entry
    ``{"from_url", "params", "to_url"}`` is appended to
    ``'output/' + DmozSpider().redirect_filename`` (followed by a comma),
    the redirect is logged, and the request goes to ``self._retry``. The
    original response is returned when ``_retry`` gives a falsy value or
    when no redirect was detected.

    Fixes over the previous version: the meta-refresh branch now records
    the real refresh target instead of an empty ``to_url``, and the
    output file is closed even if serialisation fails.
    """
    obj = DmozSpider()
    redirect_filename = 'output/' + obj.redirect_filename
    from_url = request.url

    def record(to_url, params):
        # Append one entry to the redirect log and report it.
        redirect_dict = {"from_url": from_url, "params": params, "to_url": to_url}
        with open(redirect_filename, 'a') as f:
            json.dump(redirect_dict, f, indent=2)
            f.write(",")
        log.msg("trying to redirect : %s -> %s" % (from_url, to_url), level=log.INFO)

    if response.status in [301, 302, 303, 307] and "Location" in response.headers:
        to_url = response.headers["Location"]
        record(to_url, self.extractgetList(from_url))
        reason = 'redirect %d' % response.status
        return self._retry(request, reason, spider) or response

    # handle meta redirect
    interval, redirect_url = get_meta_refresh(response)
    if redirect_url:
        to_url = redirect_url
        # NOTE(review): this branch extracts params from the target URL
        # while the header branch uses the source URL, as in the original
        # code -- confirm which one is intended.
        record(to_url, self.extractgetList(to_url))
        return self._retry(request, 'meta', spider) or response

    return response
def process_response(self, request, response, spider):
    """Follow a meta refresh for requests flagged with ``meta_refresh``.

    Requests without a truthy ``meta_refresh`` meta value pass through.
    For flagged requests:

    * no refresh target, or a target containing any entry of
      ``off_keys``, raises ``IgnoreRequest``;
    * a target starting with ``"http"`` clears the flag, drops the
      ``Content-Type``, ``Referer`` and ``Content-Length`` headers, raises
      the priority by 100 and redirects through ``self._redirect``;
    * any other target leaves the response untouched.
    """
    if not request.meta.get("meta_refresh"):
        return response

    _, location_url = get_meta_refresh(response)
    if not location_url:
        raise IgnoreRequest

    # Ignore the page when the target matches a blocked key.
    if any(off_key in location_url for off_key in off_keys):
        raise IgnoreRequest

    if not location_url.startswith("http"):
        return response

    request.meta["meta_refresh"] = False
    for header_name in ('Content-Type', "Referer", 'Content-Length'):
        request.headers.pop(header_name, None)
    request.priority += 100

    follow_up = request.replace(url=location_url)
    outcome = self._redirect(
        follow_up, request, spider,
        "local pan middlewares, meta redirected!!!")
    return outcome or response
def _check_redirect(self, request, response):
    """Return ``(redirected_request, reason)`` or ``(None, None)``.

    ``reason`` is the response status for header-based redirects and
    ``'meta refresh'`` for meta-tag redirects. Meta refresh is honoured
    only when ``request.meta['meta_refresh']`` is truthy, the response is
    an ``HtmlResponse`` and the delay is below
    ``self.max_metarefresh_delay``.
    """
    code = response.status
    with_location = 'Location' in response.headers
    head_request = request.method.upper() == 'HEAD'

    if head_request and not (code in [301, 302, 303, 307] and with_location):
        return None, None

    if head_request:
        # HEAD requests keep their method for every redirect status.
        new_url = urljoin_rfc(request.url, response.headers['location'])
        return request.replace(url=new_url), code

    if with_location and code in [302, 303]:
        new_url = urljoin_rfc(request.url, response.headers['location'])
        redirected = self._redirect_request_using_get(request, new_url)
        return redirected, code

    if with_location and code in [301, 307]:
        new_url = urljoin_rfc(request.url, response.headers['location'])
        return request.replace(url=new_url), code

    if not request.meta.get('meta_refresh', None):
        return None, None
    if not isinstance(response, HtmlResponse):
        return None, None

    interval, url = get_meta_refresh(response)
    if url and interval < self.max_metarefresh_delay:
        return self._redirect_request_using_get(request, url), 'meta refresh'
    return None, None
def process_response(self, request, response, spider):
    """Retry redirect, meta-refresh and captcha pages, logging to a file.

    Every detected case appends two lines to ``redirects.txt`` (the
    response URL and the kind of redirect) and hands the request to
    ``self._retry``. The original response is returned when ``_retry``
    gives a falsy value or when nothing was detected.

    Fixes over the previous version: the status line is now formatted
    properly (it used to contain a literal ``%d`` in front of the status
    code), and a stray no-op ``response`` statement was removed.
    """
    url = response.url
    with open('redirects.txt', 'a+') as f:
        if response.status in [301, 307]:
            reason = 'redirect %d' % response.status
            f.write("trying to redirect us: " + url + '\n')
            f.write(reason + '\n')
            return self._retry(request, reason, spider) or response

        # handle meta redirect
        interval, redirect_url = get_meta_refresh(response)
        if redirect_url:
            f.write("trying to redirect us: " + url + '\n')
            f.write('redirect meta' + '\n')
            reason = 'meta'
            return self._retry(request, reason, spider) or response

        # test for captcha page
        hxs = HtmlXPathSelector(response)
        captcha = hxs.select(
            ".//input[contains(@id, 'captchacharacters')]").extract()
        if captcha:
            f.write("trying to redirect us: " + url + '\n')
            f.write('redirect capcha' + '\n')
            reason = 'capcha'
            return self._retry(request, reason, spider) or response

        return response
def process_response(self, request, response, spider):
    """Retry responses that indicate a redirect, block or captcha.

    Checks run in this order: 301/307 status, 502/503 status, meta
    refresh tag, captcha markers in the page, then 502/503/504 status.
    The first matching check logs at INFO level and returns
    ``self._retry(...)``, or the response itself if ``_retry`` is falsy.
    """
    url = response.url
    status = response.status

    def retry(reason):
        # Fall back to the response when no retry is scheduled.
        return self._retry(request, reason, spider) or response

    def info(message):
        log.msg(message, level=log.INFO)

    if status in [301, 307]:
        info("trying to redirect us: %s" % url)
        return retry('redirect %d' % status)

    # Evaluated before the 5xx check, as in the original ordering.
    _, redirect_url = get_meta_refresh(response)

    if status in [502, 503]:
        info("Service Unavailable: %s" % url)
        return retry('Possible block with 5xx error')

    # handle meta redirect
    if redirect_url:
        info("trying to redirect us: %s" % url)
        return retry('meta')

    # test for captcha page
    selector = HtmlXPathSelector(response)
    captcha = selector.select(
        ".//input[contains(@id, 'captchacharacters')]").extract()
    if captcha or "Dostęp zablokowany" in response.text:
        info("captcha page %s" % url)
        return retry('captcha')

    if status in [502, 503, 504]:
        info("Captcha with status: \n%s" % response.text)
        return retry('captcha')

    return response
def process_response(self, request, response, spider):
    """Follow a meta refresh found in an HTML response.

    The response is returned untouched when the request sets a truthy
    ``dont_redirect`` meta value, is a HEAD request, or the response is
    not an ``HtmlResponse``. Otherwise a refresh whose delay is below
    ``self._maxdelay`` is followed through ``self._redirect``.
    """
    if request.meta.get('dont_redirect', False):
        return response
    if request.method == 'HEAD':
        return response
    if not isinstance(response, HtmlResponse):
        return response

    delay, target = get_meta_refresh(response)
    if not target or not (delay < self._maxdelay):
        return response

    follow_up = self._redirect_request_using_get(request, target)
    return self._redirect(follow_up, request, spider, 'meta refresh')
def process_response(self, request, response, spider):
    """Redirect on meta refresh unless the request or response opts out.

    Nothing happens for requests that carry a ``dont_redirect`` meta key,
    for HEAD requests, or for responses that are not ``HtmlResponse``
    instances. Otherwise a refresh with a delay below ``self._maxdelay``
    is followed via ``self._redirect``.
    """
    opted_out = "dont_redirect" in request.meta or request.method == "HEAD"
    if opted_out or not isinstance(response, HtmlResponse):
        return response

    # The response is known to be HTML from here on.
    refresh_delay, refresh_url = get_meta_refresh(response)
    if refresh_url and refresh_delay < self._maxdelay:
        new_request = self._redirect_request_using_get(request, refresh_url)
        return self._redirect(new_request, request, spider, "meta refresh")

    return response
def process_response(self, request, response, spider):
    """Retry requests whose response is a 301/307 or a meta refresh page.

    Returns ``self._retry(...)`` for such responses, falling back to the
    response itself when ``_retry`` gives a falsy value. Any other
    response is returned unchanged.

    Fix over the previous version: the function used to fall off the end
    and return ``None`` for ordinary responses, which is not a valid
    result for a downloader middleware's ``process_response``.
    """
    url = response.url

    if response.status in [301, 307]:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'redirect %d' % response.status
        return self._retry(request, reason, spider) or response

    # handle meta redirect
    interval, redirect_url = get_meta_refresh(response)
    if redirect_url:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'meta'
        return self._retry(request, reason, spider) or response

    return response
def process_response(self, request, response, spider):
    """Retry on HTTP 301/307 or on a meta refresh tag.

    When either is detected the event is logged at INFO level and the
    request goes to ``self._retry``; the original response is used when
    ``_retry`` returns a falsy value.

    Fix over the previous version: responses that are not redirects are
    now returned explicitly instead of the function returning ``None``.
    """
    url = response.url

    reason = None
    if response.status in [301, 307]:
        reason = 'redirect %d' % response.status
    else:
        # handle meta redirect
        interval, redirect_url = get_meta_refresh(response)
        if redirect_url:
            reason = 'meta'

    if reason is None:
        return response

    log.msg("trying to redirect us: %s" % url, level=log.INFO)
    return self._retry(request, reason, spider) or response
def process_response(self, request, response, spider):
    """Follow meta refresh redirects found in HTML responses.

    The response is returned as is for requests with a truthy
    ``dont_redirect`` meta value, for HEAD requests and for non-HTML
    responses. The refresh lookup is given ``self._ignore_tags``, and
    only delays below ``self._maxdelay`` are followed.
    """
    redirect_disabled = request.meta.get("dont_redirect", False)
    if redirect_disabled or request.method == "HEAD":
        return response
    if not isinstance(response, HtmlResponse):
        return response

    delay, target = get_meta_refresh(response, ignore_tags=self._ignore_tags)
    if not (target and delay < self._maxdelay):
        return response

    new_request = self._redirect_request_using_get(request, target)
    return self._redirect(new_request, request, spider, "meta refresh")
def process_response(self, request, response, spider):
    """Follow meta refresh redirects, printing conversations on the way.

    Requests carrying a ``dont_redirect`` meta key, HEAD requests and
    non-HTML responses pass through. When a refresh with a delay below
    ``self._maxdelay`` is found, ``self.print_conversation(response)`` is
    called first if ``"view"`` occurs in the response URL at an index
    greater than zero. The redirect then goes through ``self._redirect``.
    """
    if 'dont_redirect' in request.meta:
        return response
    if request.method == 'HEAD' or not isinstance(response, HtmlResponse):
        return response

    delay, target = get_meta_refresh(response)
    if not target or not (delay < self._maxdelay):
        return response

    follow_up = self._redirect_request_using_get(request, target)
    # str.find returns -1 when absent, so this only matches "view" at an
    # index strictly greater than zero.
    if response.url.find("view") > 0:
        self.print_conversation(response)
    return self._redirect(follow_up, request, spider, 'meta refresh')
def process_response(self, request, response, spider):
    """Retry on configured status codes and on HTTP 403.

    Requests with a truthy ``dont_retry`` meta value are never retried.
    A status listed in ``self.retry_http_codes`` is retried with its
    status message as the reason. A meta refresh target, if any, is only
    printed. A 403 response with a non-empty URL is retried too. Each
    retry falls back to the response when ``self._retry`` is falsy.
    """
    if request.meta.get('dont_retry', False):
        return response

    status = response.status
    if status in self.retry_http_codes:
        status_reason = response_status_message(status)
        return self._retry(request, status_reason, spider) or response

    _, redirect_url = get_meta_refresh(response)
    if redirect_url:
        print('redirect_url', redirect_url, '*******', 'response.url', response.url)

    # this is your check
    blocked = status == 403 and response.url
    if not blocked:
        return response

    print('*********************response.url', response.url)
    block_reason = 'response got xpath "{}"'.format(response.url)
    return self._retry(request, block_reason, spider) or response
def process_response(self, request, response, spider):
    """Follow at most one meta refresh redirect per request chain.

    ``dont_filter`` is always set in the request meta. The response is
    returned unchanged, and ``dont_redirect`` set in the meta, when the
    request already opts out, is a HEAD request, the response is not an
    ``HtmlResponse``, or the ``redirect_times`` meta value is at least 1.
    Otherwise a refresh with a delay below ``self._maxdelay`` is followed
    with ``dont_filter`` set on the new request.

    Fix over the previous version: a missing ``redirect_times`` meta
    value now counts as zero. Comparing the ``None`` default with ``1``
    raises ``TypeError`` on Python 3 for any request that has not been
    redirected yet.
    """
    request.meta['dont_filter'] = True

    if ('dont_redirect' in request.meta
            or request.method == 'HEAD'
            or not isinstance(response, HtmlResponse)
            or request.meta.get('redirect_times', 0) >= 1):
        request.meta['dont_redirect'] = True
        return response

    interval, url = get_meta_refresh(response)
    if url and interval < self._maxdelay:
        redirected = self._redirect_request_using_get(request, url)
        redirected.dont_filter = True
        return self._redirect(redirected, request, spider, 'meta refresh')

    return response
def process_response(self, request, response, spider):
    """Follow HTTP redirects and short meta refreshes.

    * 302/303 with a ``Location`` header: redirect through
      ``self._redirect_request_using_get``.
    * 301/307 with a ``Location`` header: redirect with the request
      otherwise unchanged.
    * Meta refresh with a delay below ``self.max_metarefresh_delay``:
      redirect through ``self._redirect_request_using_get``.

    Any other response is returned unchanged.
    """
    status = response.status

    if 'Location' in response.headers and status in [302, 303, 301, 307]:
        target = urljoin_rfc(request.url, response.headers['location'])
        if status in [302, 303]:
            follow_up = self._redirect_request_using_get(request, target)
        else:
            follow_up = request.replace(url=target)
        return self._redirect(follow_up, request, spider, status)

    delay, refresh_url = get_meta_refresh(response)
    if not refresh_url or not (delay < self.max_metarefresh_delay):
        return response

    follow_up = self._redirect_request_using_get(request, refresh_url)
    return self._redirect(follow_up, request, spider, 'meta refresh')
def process_response(self, request, response, spider):
    """Log redirects to the spider's redirect file and retry the request.

    Handles header redirects (301/302/303/307 with ``Location``) and meta
    refresh redirects. Each one appends a JSON entry with ``from_url``,
    ``params`` and ``to_url`` (followed by a comma) to
    ``'output/' + DmozSpider().redirect_filename``, logs the redirect and
    returns ``self._retry(...)``. The response itself is returned when
    ``_retry`` is falsy or when the response is not a redirect.

    Fixes over the previous version: the meta-refresh branch left
    ``to_url`` empty, so the entry and log line never showed the real
    target; the output file is now closed via ``with`` even on errors.
    """
    obj = DmozSpider()
    redirect_filename = 'output/' + obj.redirect_filename
    from_url = request.url

    if response.status in [301, 302, 303, 307] and "Location" in response.headers:
        to_url = response.headers["Location"]
        get_list = self.extractgetList(from_url)
        reason = 'redirect %d' % response.status
    else:
        # handle meta redirect
        interval, redirect_url = get_meta_refresh(response)
        if not redirect_url:
            return response
        to_url = redirect_url
        # NOTE(review): params come from the target URL here but from the
        # source URL above, as in the original code -- confirm intent.
        get_list = self.extractgetList(to_url)
        reason = 'meta'

    redirect_dict = {
        "from_url": from_url,
        "params": get_list,
        "to_url": to_url,
    }
    with open(redirect_filename, 'a') as f:
        json.dump(redirect_dict, f, indent=2)
        f.write(",")

    log.msg("trying to redirect : %s -> %s" % (from_url, to_url),
            level=log.INFO)
    return self._retry(request, reason, spider) or response
def process_response(self, request, response, spider):
    """Retry redirected, meta-refreshed or blocked responses.

    A response counts as blocked when ``"userblocked"`` occurs in its
    URL. Every detected case is logged at INFO level and returns
    ``self._retry(...)``, falling back to the response when that is
    falsy. The response URL and the blocked flag are printed for
    responses that are neither a header redirect nor a meta refresh.
    """
    url = response.url

    def retry(reason):
        return self._retry(request, reason, spider) or response

    if response.status in [301, 302, 307]:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        return retry('redirect %d' % response.status)

    # handle meta redirect
    _, redirect_url = get_meta_refresh(response)
    if redirect_url:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        return retry('meta')

    print(response.url)
    userblocked = "userblocked" in response.url
    print(userblocked)

    # test for blocked page
    if not userblocked:
        return response

    log.msg("blocked page %s" % url, level=log.INFO)
    return retry('blocked')
def process_response(self, request, response, spider):
    """Retry responses that are redirects, meta refreshes or captchas.

    Emits two debug log lines and prints the response on every call. A
    301/302 status, a meta refresh target, or an input whose id contains
    ``captchacharacters`` is logged and sent to ``self._retry``; the
    response is returned when ``_retry`` is falsy or nothing matched.
    """
    url = response.url
    status = response.status

    logging.debug('WangJun I am OK111')
    logging.debug('WangJun I am OK111 %d' % status)
    print('Cookie', response)

    if status in [301, 302]:
        logging.info("WangJun trying to redirect us: %s" % url)
        return self._retry(request, 'redirect %d' % status, spider) or response

    # handle meta redirect
    _, redirect_url = get_meta_refresh(response)
    if redirect_url:
        logging.info("trying to redirect us: %s" % url)
        return self._retry(request, 'meta', spider) or response

    # test for captcha page
    captcha = Selector(response).xpath(
        ".//input[contains(@id, 'captchacharacters')]").extract()
    if not captcha:
        return response

    logging.info("captcha page %s" % url)
    return self._retry(request, 'capcha', spider) or response
def process_response(self, request, response, spider):
    """Retry requests that were answered with a redirect.

    * Status 200 responses pass straight through.
    * 301/307 responses are logged with their ``Location`` header (or the
      response URL when the header is absent) and sent to ``self._retry``.
    * Other responses are checked for a meta refresh tag and, if one is
      found, sent to ``self._retry`` as well.

    The response itself is returned whenever ``_retry`` gives a falsy
    value, no redirect is detected, or the meta refresh lookup fails.

    Fixes over the previous version: Python 2-only ``except``/``print``
    syntax is replaced by ``logger.exception``; every path now returns a
    response instead of ``None``; a missing ``Location`` header no longer
    raises ``KeyError``; dead commented-out code was removed.
    """
    # Skip pages that were fetched successfully.
    if response.status == 200:
        return response

    url = response.url
    if response.status in [301, 307]:
        url = response.headers.get('Location', url)
        logger.info("trying to redirect us: %s", url)
        reason = 'redirect %d' % response.status
        return self._retry(request, reason, spider) or response

    # get_meta_refresh was seen to fail with "'Response' object has no
    # attribute 'text'", so the lookup is guarded for now and the
    # response is passed through when it fails.
    try:
        # handle meta redirect
        interval, redirect_url = get_meta_refresh(response)
    except Exception:
        logger.exception("get_meta_refresh failed for %s", url)
        return response

    if redirect_url:
        logger.info("trying to redirect us: %s", url)
        reason = 'meta'
        return self._retry(request, reason, spider) or response

    return response
def test_get_meta_refresh(self):
    """Exercise ``get_meta_refresh`` on a range of ``TextResponse`` bodies.

    Covers plain tags, a tag without URL, multi-line tags, quoted and
    relative URLs, non-default encodings, non-ASCII URLs, bodies without
    a tag, HTML comments and fractional intervals.

    Bodies are bytes literals so the test runs on Python 3 as well as
    Python 2: a text body cannot be ``.decode()``-d there, and most of
    these responses are built without an explicit encoding.
    """
    def check(body, expected, url='http://example.com', **kwargs):
        # Build the response and compare the extracted (interval, url).
        response = TextResponse(url=url, body=body, **kwargs)
        self.assertEqual(get_meta_refresh(response), expected)

    check(
        b' <html>'
        b' <head><title>Dummy</title>'
        b'<meta http-equiv="refresh" content="5;url=http://example.org/newpage" />'
        b'</head>'
        b' <body>blahablsdfsal&</body>'
        b' </html>',
        (5, 'http://example.org/newpage'), url='http://example.org')

    # refresh without url should return (None, None)
    check(b'<meta http-equiv="refresh" content="5" />',
          (None, None), url='http://example.org')

    check(b'<meta http-equiv="refresh" content="5; '
          b'url=http://example.org/newpage" /></head>',
          (5, 'http://example.org/newpage'), url='http://example.org')

    # meta refresh in multiple lines
    check(b'<html><head>\n'
          b'<META\n'
          b'HTTP-EQUIV="Refresh"\n'
          b'CONTENT="1; URL=http://example.org/newpage">',
          (1, 'http://example.org/newpage'), url='http://example.org')

    # quoted redirect url
    check(b'<meta http-equiv="refresh" '
          b'content="3; url=\'http://www.example.com/other\'">',
          (3, 'http://www.example.com/other'))

    # relative redirects
    check(b'<meta http-equiv="refresh" content="3; url=other.html">',
          (3, 'http://example.com/page/other.html'),
          url='http://example.com/page/this.html')

    # non-standard encodings (utf-16)
    body = b'<meta http-equiv="refresh" content="3; url=http://example.com/redirect">'
    check(body.decode('ascii').encode('utf-16'),
          (3, 'http://example.com/redirect'), encoding='utf-16')

    # non-ascii chars in the url (utf8)
    check(b'<meta http-equiv="refresh" '
          b'content="3; url=http://example.com/to\xc2\xa3">',
          (3, 'http://example.com/to%C2%A3'), encoding='utf-8')

    # non-ascii chars in the url (latin1)
    check(b'<meta http-equiv="refresh" '
          b'content="3; url=http://example.com/to\xa3">',
          (3, 'http://example.com/to%C2%A3'), encoding='latin1')

    # responses without refresh tag should return None None
    response = TextResponse(url='http://example.org')
    self.assertEqual(get_meta_refresh(response), (None, None))

    # html commented meta refresh header must not directed
    check(b'<!--<meta http-equiv="refresh" '
          b'content="3; url=http://example.com/">-->',
          (None, None))

    # html comments must not interfere with uncommented meta refresh header
    check(b'<!-- commented --><meta http-equiv="refresh" '
          b'content="3; url=http://example.com/">-->',
          (3, 'http://example.com/'))

    # float refresh intervals
    check(b'<meta http-equiv="refresh" content=".1;URL=index.html" />',
          (0.1, 'http://example.com/index.html'))
    check(b'<meta http-equiv="refresh" content="3.1;URL=index.html" />',
          (3.1, 'http://example.com/index.html'))
def test_get_meta_refresh(self):
    """Check ``get_meta_refresh`` against a list of ``TextResponse`` cases.

    Each case is ``(response url, body, extra kwargs, expected result)``.
    Bodies are bytes literals so the test also runs on Python 3, where a
    text body has no ``.decode()`` and cannot be used without an explicit
    encoding.
    """
    utf16_body = (
        b'<meta http-equiv="refresh" '
        b'content="3; url=http://example.com/redirect">'
    ).decode('ascii').encode('utf-16')

    cases = [
        ('http://example.org',
         b' <html>'
         b' <head><title>Dummy</title>'
         b'<meta http-equiv="refresh" content="5;url=http://example.org/newpage" />'
         b'</head>'
         b' <body>blahablsdfsal&</body>'
         b' </html>',
         {}, (5, 'http://example.org/newpage')),
        # refresh without url should return (None, None)
        ('http://example.org',
         b'<meta http-equiv="refresh" content="5" />',
         {}, (None, None)),
        ('http://example.org',
         b'<meta http-equiv="refresh" content="5; '
         b'url=http://example.org/newpage" /></head>',
         {}, (5, 'http://example.org/newpage')),
        # meta refresh in multiple lines
        ('http://example.org',
         b'<html><head>\n'
         b'<META\n'
         b'HTTP-EQUIV="Refresh"\n'
         b'CONTENT="1; URL=http://example.org/newpage">',
         {}, (1, 'http://example.org/newpage')),
        # quoted redirect url
        ('http://example.com',
         b'<meta http-equiv="refresh" '
         b'content="3; url=\'http://www.example.com/other\'">',
         {}, (3, 'http://www.example.com/other')),
        # relative redirects
        ('http://example.com/page/this.html',
         b'<meta http-equiv="refresh" content="3; url=other.html">',
         {}, (3, 'http://example.com/page/other.html')),
        # non-standard encodings (utf-16)
        ('http://example.com', utf16_body,
         {'encoding': 'utf-16'}, (3, 'http://example.com/redirect')),
        # non-ascii chars in the url (default encoding - utf8)
        ('http://example.com',
         b'<meta http-equiv="refresh" '
         b'content="3; url=http://example.com/to\xc2\xa3">',
         {}, (3, 'http://example.com/to%C2%A3')),
        # non-ascii chars in the url (custom encoding - latin1)
        ('http://example.com',
         b'<meta http-equiv="refresh" '
         b'content="3; url=http://example.com/to\xa3">',
         {'encoding': 'latin1'}, (3, 'http://example.com/to%C2%A3')),
    ]

    for url, body, extra, expected in cases:
        response = TextResponse(url=url, body=body, **extra)
        self.assertEqual(get_meta_refresh(response), expected)

    # responses without refresh tag should return None None
    response = TextResponse(url='http://example.org')
    self.assertEqual(get_meta_refresh(response), (None, None))