class TestReferrerOnRedirect(TestRefererMiddleware):
    """Check the ``Referer`` header before and after a chain of redirects.

    Each entry of ``scenarii`` is a 5-tuple::

        (parent URL, target URL, redirections, initial referrer, final referrer)

    where ``redirections`` is a sequence of ``(status code, Location URL)``
    pairs that are fed, in order, through the redirect middleware.
    """

    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.UnsafeUrlPolicy'}
    scenarii = [
        (
            'http://scrapytest.org/1',  # parent
            'http://scrapytest.org/2',  # target
            (
                # redirections: code, URL
                (301, 'http://scrapytest.org/3'),
                (301, 'http://scrapytest.org/4'),
            ),
            b'http://scrapytest.org/1',  # expected initial referer
            b'http://scrapytest.org/1',  # expected referer for the redirection request
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.org/2',
            (
                # redirecting to non-secure URL
                (301, 'http://scrapytest.org/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.com/2',
            (
                # redirecting to non-secure URL: different origin
                (301, 'http://scrapytest.com/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
    ]

    def setUp(self):
        """Create the spider and both middlewares from the class-level settings."""
        self.spider = Spider('foo')
        mw_settings = Settings(self.settings)
        self.referrermw = RefererMiddleware(mw_settings)
        self.redirectmw = RedirectMiddleware(mw_settings)

    def _follow_redirections(self, request, redirections):
        """Push ``request`` through every redirection and return the last result."""
        for status_code, location in redirections:
            redirect_response = Response(
                request.url,
                headers={'Location': location},
                status=status_code,
            )
            request = self.redirectmw.process_response(
                request, redirect_response, self.spider
            )
            # Let the referrer middleware see each scheduled redirect request.
            self.referrermw.request_scheduled(request, self.spider)
        return request

    def test(self):
        """Run every scenario and compare the initial and final referrers."""
        for scenario in self.scenarii:
            parent, target, redirections, init_referrer, final_referrer = scenario

            parent_response = self.get_response(parent)
            initial_request = self.get_request(target)

            produced = list(
                self.referrermw.process_spider_output(
                    parent_response, [initial_request], self.spider
                )
            )
            self.assertEqual(produced[0].headers.get('Referer'), init_referrer)

            last_request = self._follow_redirections(initial_request, redirections)

            assert isinstance(last_request, Request)
            self.assertEqual(last_request.headers.get('Referer'), final_referrer)
def process_response(self, request, response, spider):
    """Process a spider response.

    Delegates to ``RedirectMiddleware.process_response``. When that call
    produces a redirect ``Request``, the request is only returned if both
    ``NoSubdomainOffsiteMiddleware.should_follow`` and
    ``ExclusionRuleMiddleware.should_follow`` accept it; any other result
    is returned unchanged.

    Raises:
        IgnoreRequest: the redirect target was rejected by either check.
    """
    # Build the host regex once, the first time a response comes through.
    if not hasattr(self, 'host_regex'):
        self.host_regex = self.get_host_regex(spider)

    outcome = RedirectMiddleware.process_response(self, request, response, spider)

    # Not a redirect request: nothing to filter.
    if not isinstance(outcome, Request):
        return outcome

    # Check that the redirect request is not offsite
    if not NoSubdomainOffsiteMiddleware.should_follow(self, outcome, spider):
        logging.info(
            "Excluding redirect due to no offsite domains %s" % outcome.url)
        raise IgnoreRequest

    if not ExclusionRuleMiddleware.should_follow(self, outcome, spider):
        logging.info(
            "Excluding redirect due to exclusion rule %s" % outcome.url)
        raise IgnoreRequest

    return outcome
def process_response(self, request, response, spider):
    """Process a spider response.

    Runs the stock ``RedirectMiddleware.process_response`` and vets any
    resulting redirect ``Request`` against the offsite check followed by the
    exclusion-rule check. Non-request results are handed back as they are.

    Raises:
        IgnoreRequest: one of the two checks refused the redirect target.
    """
    if not hasattr(self, 'host_regex'):
        # First call on this instance: compute and cache the host regex.
        self.host_regex = self.get_host_regex(spider)

    redirected = RedirectMiddleware.process_response(
        self, request, response, spider
    )
    if not isinstance(redirected, Request):
        return redirected

    # Checks are evaluated in order; the first refusal is logged and aborts.
    checks = (
        (NoSubdomainOffsiteMiddleware,
         "Excluding redirect due to no offsite domains %s"),
        (ExclusionRuleMiddleware,
         "Excluding redirect due to exclusion rule %s"),
    )
    for middleware, message in checks:
        if not middleware.should_follow(self, redirected, spider):
            logging.info(message % redirected.url)
            raise IgnoreRequest

    return redirected
def process_response(self, request, response, spider):
    """Intercept special redirect targets, then defer to ``RedirectMiddleware``.

    Two kinds of request URL are short-circuited before the stock redirect
    handling runs:

    * a path ending in ``outTimePage.aspx`` (logged as an out-of-date cookie);
    * a URL containing ``message=`` (the decoded message and the item's
      ``cid`` are logged as a rejection).

    In both cases the result of ``self._recover_post(request)`` is returned
    (presumably a rebuilt copy of the original POST; confirm against
    ``_recover_post``).

    Otherwise the response goes through ``RedirectMiddleware.process_response``.
    When a POST request was answered with a 302 and a redirect request was
    produced, the original body is stored in that request's
    ``meta['post_body']``.

    Returns:
        Whatever ``_recover_post`` or ``RedirectMiddleware.process_response``
        returned.
    """
    # TODO: Delete the outdated cookie
    if urlparse(request.url).path.split('/')[-1] == 'outTimePage.aspx':
        logger.debug('Cookie %s is out of date.', request.cookies)
        return self._recover_post(request)

    if 'message=' in request.url:
        logger.debug(unquote(request.url).split('message=')[1])
        logger.debug('%s rejected.', request.meta['item']['cid'][0])
        return self._recover_post(request)

    ret = RedirectMiddleware.process_response(self, request, response, spider)

    # Record data to be posted.
    # RedirectMiddleware hands back the very same response object when it
    # does not redirect (e.g. a 302 without a Location header). A response
    # is not tied to a request at this stage, so touching its ``meta`` would
    # raise AttributeError; only annotate an actual redirect request.
    if (response.status == 302 and request.method == 'POST'
            and ret is not response):
        if not request.body:
            # Previously a debugger breakpoint; log instead of halting.
            logger.warning(
                'POST request to %s was redirected with an empty body.',
                request.url)
        ret.meta['post_body'] = request.body

    return ret
def setUp(self):
    """Prepare a crawler, a spider named ``'foo'`` and the redirect middleware."""
    crawler = get_crawler(Spider)
    self.crawler = crawler
    # Both the spider and the middleware are built from the same crawler.
    self.spider = crawler._create_spider('foo')
    self.mw = RedirectMiddleware.from_crawler(crawler)
def setUp(self):
    """Create the spider plus referrer and redirect middlewares for each test.

    Both middlewares share one ``Settings`` object built from ``self.settings``.
    """
    mw_settings = Settings(self.settings)
    self.spider = Spider('foo')
    self.referrermw = RefererMiddleware(mw_settings)
    self.redirectmw = RedirectMiddleware(mw_settings)
def __init__(self, settings):
    """Initialise both base middlewares, then expose the crawler's stats.

    Args:
        settings: passed unchanged to each base ``__init__``.
    """
    # Order matters: RedirectMiddleware first, then DownloaderBaseMiddleware.
    for base in (RedirectMiddleware, DownloaderBaseMiddleware):
        base.__init__(self, settings)
    # NOTE(review): ``self.crawler`` is not assigned here; presumably one of
    # the base initialisers (or the class) provides it - confirm.
    self.stats = self.crawler.stats