Example #1
    def test_log_debug(self):
        with LogCapture() as l:
            settings = {'DUPEFILTER_DEBUG': True,
                        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            scheduler = Scheduler.from_crawler(crawler)
            spider = SimpleSpider.from_crawler(crawler)

            dupefilter = scheduler.df
            dupefilter.open()

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request('http://scrapytest.org/index.html',
                headers={'Referer': 'http://scrapytest.org/INDEX.html'}
            )

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
            l.check_present(('scrapy.dupefilters', 'DEBUG',
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' (referer: None)')))
            l.check_present(('scrapy.dupefilters', 'DEBUG',
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' (referer: http://scrapytest.org/INDEX.html)')))

            dupefilter.close('finished')
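Several of these tests point `DUPEFILTER_CLASS` at a `FromCrawlerRFPDupeFilter` that is defined elsewhere in the test module and not shown in the snippets. A minimal sketch, assuming it only wraps Scrapy's stock `RFPDupeFilter` and records that it was built through `from_crawler()`, could look like this:

    from scrapy.dupefilters import RFPDupeFilter

    class FromCrawlerRFPDupeFilter(RFPDupeFilter):
        # Hypothetical test double: behaves exactly like the default
        # request-fingerprint dupefilter, but tags how it was constructed.
        @classmethod
        def from_crawler(cls, crawler):
            df = cls.from_settings(crawler.settings)  # honours DUPEFILTER_DEBUG
            df.method = 'from_crawler'
            return df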
Example #2
    def test_log(self):
        with LogCapture() as l:
            settings = {
                'DUPEFILTER_DEBUG': False,
                'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'
            }
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            scheduler = Scheduler.from_crawler(crawler)
            spider = SimpleSpider.from_crawler(crawler)

            dupefilter = scheduler.df
            dupefilter.open()

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request('http://scrapytest.org/index.html')

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
            l.check_present(('scrapy.dupefilters', 'DEBUG', (
                'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' - no more duplicates will be shown'
                ' (see DUPEFILTER_DEBUG to show all duplicates)')))

            dupefilter.close('finished')
Example #3
 def test_retry_dns_error(self):
     with mock.patch('socket.gethostbyname',
                     side_effect=socket.gaierror(
                         -5, 'No address associated with hostname')):
         spider = SimpleSpider("http://example.com/")
         yield docrawl(spider)
         self._assert_retried()
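The retry tests call a `self._assert_retried()` helper that is not included in these snippets. A plausible sketch, assuming it counts the standard RetryMiddleware messages in the captured crawl log via the same `get_testlog()` used in the next example, might be:

     def _assert_retried(self):
         # Hypothetical helper: with the default RETRY_TIMES of 2, a request
         # that keeps failing is retried twice and then given up on.
         log = get_testlog()
         self.assertEqual(log.count("Retrying"), 2)
         self.assertEqual(log.count("Gave up retrying"), 1)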
Example #4
    def test_unbounded_response(self):
        # The completeness of a response without Content-Length or Transfer-Encoding
        # cannot be determined; we treat it as valid but flag it as "partial"
        from urllib.parse import urlencode
        query = urlencode({
            'raw':
            '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
        })
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)
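`SimpleSpider` itself is defined outside these snippets. Judging from the one-URL constructor calls and the "Got response 200" assertion above, a minimal sketch, assuming the spider simply crawls a single URL and logs the response status, could be:

    import scrapy

    class SimpleSpider(scrapy.Spider):
        # Hypothetical sketch: crawl one URL and log the line that
        # test_unbounded_response counts in the captured log.
        name = 'simple'

        def __init__(self, url=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.start_urls = [url] if url else []

        def parse(self, response):
            self.logger.info("Got response %d", response.status)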
Example #5
    def test_log(self):
        with LogCapture() as log:
            settings = {
                'DUPEFILTER_DEBUG': False,
                'DUPEFILTER_CLASS': FromCrawlerRFPDupeFilter,
                'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'
            }
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            spider = SimpleSpider.from_crawler(crawler)
            dupefilter = _get_dupefilter(crawler=crawler)

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request('http://scrapytest.org/index.html')

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
            log.check_present((
                'scrapy.dupefilters', 'DEBUG',
                'Filtered duplicate request: <GET http://scrapytest.org/index.html> - no more'
                ' duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)'
            ))

            dupefilter.close('finished')
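This newer variant of the test obtains its dupefilter from a `_get_dupefilter()` helper instead of touching the scheduler directly. The helper is not shown; a minimal sketch, assuming it just wraps the scheduler-based setup used in the earlier examples, might be:

    from scrapy.core.scheduler import Scheduler

    def _get_dupefilter(*, crawler, open=True):
        # Hypothetical helper: build the scheduler for the crawler and hand
        # back its (optionally opened) dupefilter, as the older tests do by hand.
        scheduler = Scheduler.from_crawler(crawler)
        dupefilter = scheduler.df
        if open:
            dupefilter.open()
        return dupefilter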
Example #6
    def test_log_debug_default_dupefilter(self):
        with LogCapture() as log:
            settings = {
                'DUPEFILTER_DEBUG': True,
                'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION'
            }
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            spider = SimpleSpider.from_crawler(crawler)
            dupefilter = _get_dupefilter(crawler=crawler)

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request(
                'http://scrapytest.org/index.html',
                headers={'Referer': 'http://scrapytest.org/INDEX.html'})

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
            log.check_present((
                'scrapy.dupefilters', 'DEBUG',
                'Filtered duplicate request: <GET http://scrapytest.org/index.html> (referer: None)'
            ))
            log.check_present((
                'scrapy.dupefilters', 'DEBUG',
                'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' (referer: http://scrapytest.org/INDEX.html)'))

            dupefilter.close('finished')
Example #7
 def test_https_noconnect(self):
     os.environ[
         'https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(200)
     # restore the plain proxy URL (without ?noconnect) for later tests
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
Example #8
 def test_https_tunnel_auth_error(self):
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(407)
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
Example #9
    def test_log(self):
        with LogCapture() as l:
            settings = {'DUPEFILTER_DEBUG': False,
                        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            scheduler = Scheduler.from_crawler(crawler)
            spider = SimpleSpider.from_crawler(crawler)

            dupefilter = scheduler.df
            dupefilter.open()

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request('http://scrapytest.org/index.html')
            
            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
            l.check_present(('scrapy.dupefilters', 'DEBUG', 
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' - no more duplicates will be shown'
                ' (see DUPEFILTER_DEBUG to show all duplicates)')))

            dupefilter.close('finished')
Example #10
 def test_retry_conn_failed(self):
     spider = SimpleSpider("http://localhost:65432/status?n=503")
     yield docrawl(spider)
     self._assert_retried()
Example #11
 def test_retry_503(self):
     spider = SimpleSpider("http://localhost:8998/status?n=503")
     yield docrawl(spider)
     self._assert_retried()
Example #12
 def test_retry_conn_aborted(self):
     # connection lost before receiving data
     spider = SimpleSpider("http://localhost:8998/drop?abort=1")
     yield docrawl(spider)
     self._assert_retried()
Example #13
 def test_https_noconnect_auth_error(self):
     os.environ[
         'https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(407)
Example #14
 def test_https_connect_tunnel_error(self):
     spider = SimpleSpider("https://localhost:99999/status?n=200")
     yield docrawl(spider)
     self._assert_got_tunnel_error()
Example #15
 def test_https_connect_tunnel(self):
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(200)
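The proxy tests rely on two assertion helpers that do not appear in the snippets. A plausible sketch, assuming they scan the captured crawl log the same way the retry and response tests do, could be:

     def _assert_got_response_code(self, code):
         # Hypothetical: exactly one "Crawled (<code>) ..." line should appear.
         log = get_testlog()
         self.assertEqual(log.count("Crawled (%d)" % code), 1)

     def _assert_got_tunnel_error(self):
         # Hypothetical: a failed CONNECT surfaces as a TunnelError in the log.
         log = get_testlog()
         self.assertIn("TunnelError", log)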