def test_find_refresh_url(self):
    url = find_refresh_url("""
        <meta http-equiv="refresh" content="5">
    """)
    self.assertEqual(None, url)

    url = find_refresh_url("""
        <meta http-equiv="refresh" content="5;URL='http://example.com/'">
    """)
    self.assertEqual('http://example.com/', url)

    url = find_refresh_url("""
        <meta http-equiv="refresh" content="0;URL='http://example.com/'">
    """)
    self.assertEqual('http://example.com/', url)

    url = find_refresh_url("""
        <meta http-equiv="refresh" content="5; url=http://example.com/">
    """)
    self.assertEqual('http://example.com/', url)

    url = find_refresh_url("""
        <meta http-equiv="refresh" content=" 0 ; url=http://example.com/">
    """)
    self.assertEqual('http://example.com/', url)
def get_meta_refresh_url(self):
    return find_refresh_url(self.unicode_body())
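# For reference: the tests above and `get_meta_refresh_url` both rely on
# `find_refresh_url`, which is defined elsewhere in the library. The helper
# below is only a minimal sketch of one way such a parser could look (a
# case-insensitive regex scan of the markup); the name
# `find_refresh_url_sketch` and the pattern are illustrative assumptions,
# not the library's actual implementation.
import re

RE_META_REFRESH = re.compile(
    r"""<meta[^>]*http-equiv\s*=\s*["']refresh["'][^>]*"""
    r"""content\s*=\s*["']\s*\d+\s*;\s*url\s*=\s*["']?([^"'>]+)""",
    re.IGNORECASE)


def find_refresh_url_sketch(html):
    """Return the URL from a <meta http-equiv="refresh"> tag, or None."""
    match = RE_META_REFRESH.search(html)
    if match is None:
        return None
    return match.group(1).strip()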
def process_request_result(self, prepare_response_func=None):
    """
    Process result of real request performed via transport extension.
    """
    now = datetime.utcnow()

    # TODO: move into separate method
    if self.config['debug_post']:
        post = self.config['post'] or self.config['multipart_post']
        if isinstance(post, dict):
            post = list(post.items())
        if post:
            if isinstance(post, six.string_types):
                post = post[:self.config['debug_post_limit']] + '...'
            else:
                items = normalize_http_values(post, charset='utf-8')
                new_items = []
                for key, value in items:
                    if len(value) > self.config['debug_post_limit']:
                        value = (value[:self.config['debug_post_limit']]
                                 + '...')
                    new_items.append((key, value))
                post = '\n'.join('%-25s: %s' % x for x in new_items)
        if post:
            logger_network.debug('[%02d] POST request:\n%s\n'
                                 % (self.request_counter, post))

    # It is important to delete old POST data after the request has been
    # performed; otherwise the next request would try to reuse it!
    old_refresh_count = self.config['refresh_redirect_count']
    self.reset_temporary_options()

    if prepare_response_func:
        self.doc = prepare_response_func(self.transport, self)
    else:
        self.doc = self.transport.prepare_response(self)

    # Workaround
    if self.doc.grab is None:
        self.doc.grab = weakref.proxy(self)

    if self.config['reuse_cookies']:
        self.cookies.update(self.doc.cookies)

    self.doc.timestamp = now

    self.config['charset'] = self.doc.charset

    if self.config['log_file']:
        with open(self.config['log_file'], 'wb') as out:
            out.write(self.doc.body)

    if self.config['cookiefile']:
        self.cookies.save_to_file(self.config['cookiefile'])

    if self.config['reuse_referer']:
        self.config['referer'] = self.doc.url

    self.copy_request_data()

    # Should be called after `copy_request_data`
    self.save_dumps()

    # TODO: check max redirect count
    if self.config['follow_refresh']:
        url = find_refresh_url(self.doc.unicode_body())
        if url is not None:
            inc_count = old_refresh_count + 1
            if inc_count > self.config['redirect_limit']:
                raise error.GrabTooManyRedirectsError()
            return self.request(url=url, refresh_redirect_count=inc_count)

    return None