예제 #1
0
class TestReferrerOnRedirect(TestRefererMiddleware):
    """Check the ``Referer`` header kept on requests produced by redirects.

    Every entry of ``scenarii`` is a 5-tuple::

        (parent URL, target URL, redirections,
         expected initial referer, expected referer after the redirections)

    where ``redirections`` is a sequence of ``(status code, Location URL)``
    pairs that are applied one after another.
    """

    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.UnsafeUrlPolicy'}
    scenarii = [
        (
            'http://scrapytest.org/1',      # parent
            'http://scrapytest.org/2',      # target
            (
                # redirections: code, URL
                (301, 'http://scrapytest.org/3'),
                (301, 'http://scrapytest.org/4'),
            ),
            b'http://scrapytest.org/1',  # expected initial referer
            b'http://scrapytest.org/1',  # expected referer for the redirection request
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.org/2',
            (
                # redirecting to non-secure URL
                (301, 'http://scrapytest.org/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.com/2',
            (
                # redirecting to non-secure URL: different origin
                (301, 'http://scrapytest.com/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
    ]

    def setUp(self):
        """Build a spider and both middlewares from the class-level settings."""
        self.spider = Spider('foo')
        settings = Settings(self.settings)
        self.referrermw = RefererMiddleware(settings)
        self.redirectmw = RedirectMiddleware(settings)

    def test(self):
        """Run every scenario and compare the referer before and after redirects."""
        for parent, target, redirections, init_referrer, final_referrer in self.scenarii:
            response = self.get_response(parent)
            request = self.get_request(target)

            # The referrer middleware sets the header on spider output.
            out = list(self.referrermw.process_spider_output(response, [request], self.spider))
            self.assertEqual(out[0].headers.get('Referer'), init_referrer)

            # Feed each redirect response through the redirect middleware and
            # notify the referrer middleware about the request it produced.
            for status, url in redirections:
                response = Response(request.url, headers={'Location': url}, status=status)
                request = self.redirectmw.process_response(request, response, self.spider)
                self.referrermw.request_scheduled(request, self.spider)

            # A unittest assertion is used instead of a bare ``assert`` so the
            # check is not stripped when Python runs with -O.
            self.assertIsInstance(request, Request)
            self.assertEqual(request.headers.get('Referer'), final_referrer)
class TestReferrerOnRedirect(TestRefererMiddleware):
    """Verify which ``Referer`` value survives a chain of HTTP redirects.

    ``scenarii`` holds one tuple per case, laid out as: parent URL, target
    URL, a sequence of ``(status, Location URL)`` redirections, the referer
    expected on the initial request, and the referer expected on the request
    obtained after all redirections.
    """

    settings = {'REFERRER_POLICY': 'scrapy.spidermiddlewares.referer.UnsafeUrlPolicy'}
    scenarii = [
        (
            'http://scrapytest.org/1',      # parent
            'http://scrapytest.org/2',      # target
            (
                # redirections: code, URL
                (301, 'http://scrapytest.org/3'),
                (301, 'http://scrapytest.org/4'),
            ),
            b'http://scrapytest.org/1',  # expected initial referer
            b'http://scrapytest.org/1',  # expected referer for the redirection request
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.org/2',
            (
                # redirecting to non-secure URL
                (301, 'http://scrapytest.org/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
        (
            'https://scrapytest.org/1',
            'https://scrapytest.com/2',
            (
                # redirecting to non-secure URL: different origin
                (301, 'http://scrapytest.com/3'),
            ),
            b'https://scrapytest.org/1',
            b'https://scrapytest.org/1',
        ),
    ]

    def setUp(self):
        """Create the spider plus the referrer and redirect middlewares."""
        self.spider = Spider('foo')
        settings = Settings(self.settings)
        self.referrermw = RefererMiddleware(settings)
        self.redirectmw = RedirectMiddleware(settings)

    def test(self):
        """Assert the initial and the post-redirect referer of each scenario."""
        for parent, target, redirections, init_referrer, final_referrer in self.scenarii:
            response = self.get_response(parent)
            request = self.get_request(target)

            out = list(self.referrermw.process_spider_output(response, [request], self.spider))
            self.assertEqual(out[0].headers.get('Referer'), init_referrer)

            # Replay the redirections in order; each step replaces ``request``
            # with whatever the redirect middleware returns.
            for status, url in redirections:
                response = Response(request.url, headers={'Location': url}, status=status)
                request = self.redirectmw.process_response(request, response, self.spider)
                self.referrermw.request_scheduled(request, self.spider)

            # assertIsInstance keeps working under ``python -O`` and reports
            # the offending object, unlike a bare ``assert isinstance(...)``.
            self.assertIsInstance(request, Request)
            self.assertEqual(request.headers.get('Referer'), final_referrer)
예제 #3
0
 def process_response(self, request, response, spider):
     """Let a redirect through only when both follow-filters accept it.

     ``self.host_regex`` is computed with ``self.get_host_regex(spider)`` the
     first time this method runs. The response is then handed to
     ``RedirectMiddleware.process_response``; anything it returns that is
     not a ``Request`` is passed back unchanged.

     :raises IgnoreRequest: when the redirect request is rejected by
         ``NoSubdomainOffsiteMiddleware.should_follow`` or by
         ``ExclusionRuleMiddleware.should_follow``.
     """
     if not hasattr(self, 'host_regex'):
         self.host_regex = self.get_host_regex(spider)

     outcome = RedirectMiddleware.process_response(
         self, request, response, spider)

     # Only redirect requests need filtering.
     if not isinstance(outcome, Request):
         return outcome

     # Check that the redirect request is not offsite
     if not NoSubdomainOffsiteMiddleware.should_follow(self, outcome, spider):
         logging.info(
             "Excluding redirect due to no offsite domains %s" % outcome.url)
         raise IgnoreRequest

     if not ExclusionRuleMiddleware.should_follow(self, outcome, spider):
         logging.info(
             "Excluding redirect due to exclusion rule %s" % outcome.url)
         raise IgnoreRequest

     return outcome
예제 #4
0
 def process_response(self, request, response, spider):
     """Run the redirect logic, then drop redirects the filters reject.

     On first use ``self.host_regex`` is filled in from
     ``self.get_host_regex(spider)``. The value returned by
     ``RedirectMiddleware.process_response`` is returned as is unless it is
     a ``Request``; a ``Request`` must pass the offsite check first and the
     exclusion-rule check second.

     :raises IgnoreRequest: if either check refuses the redirect request.
     """
     if not hasattr(self, 'host_regex'):
         self.host_regex = self.get_host_regex(spider)

     redirected = RedirectMiddleware.process_response(
         self, request, response, spider
     )
     if not isinstance(redirected, Request):
         return redirected

     # Check that the redirect request is not offsite
     stays_onsite = NoSubdomainOffsiteMiddleware.should_follow(
         self, redirected, spider)
     if not stays_onsite:
         logging.info("Excluding redirect due to no offsite domains %s" %
                      redirected.url)
         raise IgnoreRequest

     if ExclusionRuleMiddleware.should_follow(self, redirected, spider):
         return redirected

     logging.info("Excluding redirect due to exclusion rule %s" %
                  redirected.url)
     raise IgnoreRequest
예제 #5
0
    def process_response(self, request, response, spider):
        """Recover rejected POSTs and remember the body of redirected POSTs.

        Two kinds of request URL short-circuit the normal redirect handling
        and return ``self._recover_post(request)`` instead:

        * the last path segment is ``outTimePage.aspx`` (logged as an
          out-of-date cookie);
        * the URL contains ``message=`` (the decoded message and the item's
          first ``cid`` are logged).

        Otherwise the response goes through
        ``RedirectMiddleware.process_response``. When a POST was answered
        with a 302 and the base middleware produced a new object, the
        original request body is stored in ``meta['post_body']`` of that
        object.

        :returns: the result of ``self._recover_post(request)`` or of
            ``RedirectMiddleware.process_response``.
        """
        # TODO: Delete the outdated cookie
        if urlparse(request.url).path.split('/')[-1] == 'outTimePage.aspx':
            logger.debug('Cookie %s is out of date.', request.cookies)
            return self._recover_post(request)

        if 'message=' in request.url:
            logger.debug(unquote(request.url).split('message=')[1])
            logger.debug('%s rejected.', request.meta['item']['cid'][0])
            return self._recover_post(request)

        ret = RedirectMiddleware.process_response(self, request, response, spider)

        # Record data to be posted. The base middleware returns the original
        # response when it decides not to redirect; that response has no
        # usable ``meta`` here, so only annotate a newly created object.
        if (response.status == 302 and request.method == 'POST'
                and ret is not response):
            if not request.body:
                # Previously a debugger breakpoint; log instead of halting.
                logger.warning('Redirected POST to %s has an empty body.',
                               request.url)
            ret.meta['post_body'] = request.body

        return ret
 def setUp(self):
     """Create a crawler, a spider named 'foo' and the redirect middleware."""
     crawler = get_crawler(Spider)
     self.crawler = crawler
     self.spider = crawler._create_spider('foo')
     self.mw = RedirectMiddleware.from_crawler(crawler)
예제 #7
0
 def setUp(self):
     """Instantiate the spider and both middlewares from ``self.settings``."""
     self.spider = Spider('foo')
     # One Settings object is shared by the two middlewares.
     shared_settings = Settings(self.settings)
     self.referrermw = RefererMiddleware(shared_settings)
     self.redirectmw = RedirectMiddleware(shared_settings)
 def setUp(self):
     """Prepare a 'foo' spider plus referrer and redirect middlewares."""
     self.spider = Spider('foo')
     mw_settings = Settings(self.settings)  # built from the class-level dict
     self.referrermw = RefererMiddleware(mw_settings)
     self.redirectmw = RedirectMiddleware(mw_settings)
 def setUp(self):
     """Set up the crawler, its 'foo' spider and the middleware under test."""
     test_crawler = get_crawler(Spider)
     self.crawler = test_crawler
     self.spider = test_crawler._create_spider('foo')
     # The middleware is built through its from_crawler constructor.
     self.mw = RedirectMiddleware.from_crawler(test_crawler)
예제 #10
0
 def __init__(self, settings):
     """Initialise both base middlewares with the same settings object.

     :param settings: passed unchanged to ``RedirectMiddleware.__init__``
         and ``DownloaderBaseMiddleware.__init__``.
     """
     # Base initialisers are called explicitly, one per parent class.
     RedirectMiddleware.__init__(self, settings)
     DownloaderBaseMiddleware.__init__(self, settings)
     # NOTE(review): ``self.crawler`` is not assigned in this method;
     # presumably one of the base initialisers provides it -- confirm.
     self.stats = self.crawler.stats