Example #1
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = DuplicateStartRequestsSpider(dont_filter=True,
                                              distinct_urls=2,
                                              dupe_factor=3)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 6)

        spider = DuplicateStartRequestsSpider(dont_filter=False,
                                              distinct_urls=3,
                                              dupe_factor=4)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 3)
Example #2
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = DuplicateStartRequestsSpider(dont_filter=True,
                                              distinct_urls=2,
                                              dupe_factor=3)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 6)

        spider = DuplicateStartRequestsSpider(dont_filter=False,
                                              distinct_urls=3,
                                              dupe_factor=4)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 3)
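
Examples #1 and #2 assume a DuplicateStartRequestsSpider helper that is not defined in this section. A minimal sketch of such a spider, assuming a local /echo endpoint and matching the constructor arguments and the visited counter used by the test, could look like this:

    import scrapy


    class DuplicateStartRequestsSpider(scrapy.Spider):
        name = 'duplicate_start_requests'

        def __init__(self, dont_filter=True, distinct_urls=2, dupe_factor=3):
            super(DuplicateStartRequestsSpider, self).__init__()
            self.dont_filter = dont_filter
            self.distinct_urls = distinct_urls
            self.dupe_factor = dupe_factor
            self.visited = 0

        def start_requests(self):
            # each distinct URL is yielded dupe_factor times; with dont_filter=True
            # the dupe filter is bypassed, so every copy should reach parse()
            for i in range(self.distinct_urls):
                for _ in range(self.dupe_factor):
                    yield scrapy.Request("http://localhost:8998/echo?i=%d" % i,
                                         dont_filter=self.dont_filter)

        def parse(self, response):
            self.visited += 1

With dont_filter=True the crawl visits distinct_urls * dupe_factor pages (2 * 3 = 6); with dont_filter=False the duplicate filter collapses the copies, leaving only the distinct_urls (3) visits asserted above.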
Example #3
 def test_timeout_failure(self):
     spider = DelaySpider(n=0.5)
     yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
     self.assertTrue(spider.t1 > 0)
     self.assertTrue(spider.t2 == 0)
     self.assertTrue(spider.t2_err > 0)
     self.assertTrue(spider.t2_err > spider.t1)
     # server sends the response headers, then hangs
     spider = DelaySpider(n=0.5, b=1)
     yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
     self.assertTrue(spider.t1 > 0)
     self.assertTrue(spider.t2 == 0)
     self.assertTrue(spider.t2_err > 0)
     self.assertTrue(spider.t2_err > spider.t1)
Example #4
 def test_timeout_failure(self):
     spider = DelaySpider(n=0.5)
     yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
     self.assertTrue(spider.t1 > 0)
     self.assertTrue(spider.t2 == 0)
     self.assertTrue(spider.t2_err > 0)
     self.assertTrue(spider.t2_err > spider.t1)
     # server sends the response headers, then hangs
     spider = DelaySpider(n=0.5, b=1)
     yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
     self.assertTrue(spider.t1 > 0)
     self.assertTrue(spider.t2 == 0)
     self.assertTrue(spider.t2_err > 0)
     self.assertTrue(spider.t2_err > spider.t1)
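
Both timeout tests rely on a DelaySpider whose mock server delays the response by n seconds (and, with b=1, hangs after sending the headers), recording t1 when the request is sent, t2 on success, and t2_err on failure. A rough sketch, with the /delay endpoint URL as an assumption:

    import time

    import scrapy


    class DelaySpider(scrapy.Spider):
        name = 'delay'

        def __init__(self, n=1, b=0):
            super(DelaySpider, self).__init__()
            self.n = n   # server-side delay, in seconds
            self.b = b   # b=1 makes the mock server hang after the headers
            self.t1 = self.t2 = self.t2_err = 0

        def start_requests(self):
            self.t1 = time.time()  # timestamp of the request being sent
            url = "http://localhost:8998/delay?n=%s&b=%s" % (self.n, self.b)
            yield scrapy.Request(url, callback=self.parse, errback=self.errback)

        def parse(self, response):
            self.t2 = time.time()  # set only if a response actually arrives

        def errback(self, failure):
            self.t2_err = time.time()  # set when the download fails or times out

With DOWNLOAD_TIMEOUT (0.35s) below the server delay (0.5s), the assertions expect no successful response (t2 == 0) and an error timestamp later than the start (t2_err > t1).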
Example #5
 def test_https_noconnect(self):
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(200)
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'  # restore the proxy URL without ?noconnect
Example #6
 def test_https_tunnel_without_leak_proxy_authorization_header(self):
     request = Request("https://localhost:8999/echo")
     spider = SingleRequestSpider(seed=request)
     yield docrawl(spider)
     self._assert_got_response_code(200)
     echo = json.loads(spider.meta['responses'][0].body)
     self.assertTrue('Proxy-Authorization' not in echo['headers'])
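
Several of these tests call assertion helpers (_assert_got_response_code, _assert_got_tunnel_error, _assert_retried) that are not shown here. They presumably scan the captured crawl log via get_testlog(); the exact log strings below are assumptions, not the definitive implementation:

 def _assert_got_response_code(self, code):
     log = get_testlog()
     self.assertEqual(log.count("Crawled (%d)" % code), 1)

 def _assert_got_tunnel_error(self):
     log = get_testlog()
     self.assertEqual(log.count("TunnelError"), 1)

 def _assert_retried(self):
     log = get_testlog()
     # RetryMiddleware defaults to two retries before giving up
     self.assertEqual(log.count("Retrying"), 2)
     self.assertEqual(log.count("Gave up retrying"), 1)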
Example #7
 def test_referer_header(self):
     """Referer header is set by RefererMiddleware unless it is already set"""
     req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
     req1 = req0.replace()
     req2 = req0.replace(headers={'Referer': None})
     req3 = req0.replace(headers={'Referer': 'http://example.com'})
     req0.meta['next'] = req1
     req1.meta['next'] = req2
     req2.meta['next'] = req3
     spider = SingleRequestSpider(seed=req0)
     yield docrawl(spider)
     # basic asserts in case of weird communication errors
     self.assertIn('responses', spider.meta)
     self.assertNotIn('failures', spider.meta)
      # the start request does not get a Referer header
      echo0 = json.loads(spider.meta['responses'][0].body)
     self.assertNotIn('Referer', echo0['headers'])
     # following request sets Referer to start request url
     echo1 = json.loads(spider.meta['responses'][1].body)
     self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
     # next request avoids Referer header
     echo2 = json.loads(spider.meta['responses'][2].body)
     self.assertNotIn('Referer', echo2['headers'])
     # last request explicitly sets a Referer header
     echo3 = json.loads(spider.meta['responses'][3].body)
     self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
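
SingleRequestSpider, used here and in several tests above and below, takes a seed URL or Request, records every response in spider.meta['responses'], and follows a chained request when the test attaches one via meta['next']. A sketch under those assumptions (attribute names not shown in the tests are invented):

    import scrapy


    class SingleRequestSpider(scrapy.Spider):
        name = 'single_request'

        def __init__(self, seed=None, callback_func=None):
            super(SingleRequestSpider, self).__init__()
            self.seed = seed                    # a URL string or a Request object
            self.callback_func = callback_func
            self.meta = {}                      # inspected by the test afterwards

        def start_requests(self):
            if isinstance(self.seed, scrapy.Request):
                yield self.seed.replace(callback=self.parse, errback=self.on_error)
            else:
                yield scrapy.Request(self.seed, callback=self.parse, errback=self.on_error)

        def parse(self, response):
            self.meta.setdefault('responses', []).append(response)
            if callable(self.callback_func):
                self.callback_func(response)
            # follow the chained request, if the test attached one via meta['next']
            if 'next' in response.meta:
                yield response.meta['next']

        def on_error(self, failure):
            self.meta.setdefault('failures', []).append(failure)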
Example #8
 def test_https_noconnect(self):
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(200)
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'  # restore the proxy URL without ?noconnect
Example #9
 def test_https_tunnel_without_leak_proxy_authorization_header(self):
     request = Request("https://localhost:8999/echo")
     spider = SingleRequestSpider(seed=request)
     yield docrawl(spider)
     self._assert_got_response_code(200)
     echo = json.loads(spider.meta['responses'][0].body)
     self.assertTrue('Proxy-Authorization' not in echo['headers'])
Example #10
    def test_unbounded_response(self):
        # The completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid, but flag them as "partial"
        from urllib import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)
Example #11
    def test_unbounded_response(self):
        # The completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid, but flag them as "partial"
        from urllib import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)
Example #12
 def test_referer_header(self):
     """Referer header is set by RefererMiddleware unless it is already set"""
     req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
     req1 = req0.replace()
     req2 = req0.replace(headers={'Referer': None})
     req3 = req0.replace(headers={'Referer': 'http://example.com'})
     req0.meta['next'] = req1
     req1.meta['next'] = req2
     req2.meta['next'] = req3
     spider = SingleRequestSpider(seed=req0)
     yield docrawl(spider)
     # basic asserts in case of weird communication errors
     self.assertIn('responses', spider.meta)
     self.assertNotIn('failures', spider.meta)
      # the start request does not get a Referer header
      echo0 = json.loads(spider.meta['responses'][0].body)
     self.assertNotIn('Referer', echo0['headers'])
     # following request sets Referer to start request url
     echo1 = json.loads(spider.meta['responses'][1].body)
     self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
     # next request avoids Referer header
     echo2 = json.loads(spider.meta['responses'][2].body)
     self.assertNotIn('Referer', echo2['headers'])
     # last request explicitly sets a Referer header
     echo3 = json.loads(spider.meta['responses'][3].body)
     self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
Example #13
 def test_retry_dns_error(self):
     with mock.patch('socket.gethostbyname',
                     side_effect=socket.gaierror(
                         -5, 'No address associated with hostname')):
         spider = SimpleSpider("http://example.com/")
         yield docrawl(spider)
         self._assert_retried()
Example #14
 def test_https_tunnel_auth_error(self):
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     # the proxy answers 407, but over a CONNECT tunnel the client only sees a tunnel error
     self._assert_got_tunnel_error()
Example #15
 def test_closespider_pagecount(self):
     spider = FollowAllSpider()
     close_on = 5
     yield docrawl(spider, {'CLOSESPIDER_PAGECOUNT': close_on})
     reason = spider.meta['close_reason']
     self.assertEqual(reason, 'closespider_pagecount')
     pagecount = spider.crawler.stats.get_value('response_received_count')
     self.assertTrue(pagecount >= close_on)
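
FollowAllSpider, used by this and several later tests (urls_visited, times, total, maxlatency), is also not defined in this section. A plausible sketch, with the /follow mock endpoint and its parameters as assumptions:

    import time

    import scrapy
    from scrapy.linkextractors import LinkExtractor


    class FollowAllSpider(scrapy.Spider):
        name = 'follow_all'

        def __init__(self, total=10, maxlatency=0.0):
            super(FollowAllSpider, self).__init__()
            self.urls_visited = []
            self.times = []
            self.link_extractor = LinkExtractor()
            # the mock page lists `total` links; each response is delayed by up
            # to `maxlatency` seconds
            self.start_urls = ["http://localhost:8998/follow?total=%s&maxlatency=%s"
                               % (total, maxlatency)]

        def parse(self, response):
            self.urls_visited.append(response.url)
            self.times.append(time.time())
            for link in self.link_extractor.extract_links(response):
                yield scrapy.Request(link.url, callback=self.parse)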
Example #16
 def test_closespider_itemcount(self):
     spider = ItemSpider()
     close_on = 5
     yield docrawl(spider, {'CLOSESPIDER_ITEMCOUNT': close_on})
     reason = spider.meta['close_reason']
     self.assertEqual(reason, 'closespider_itemcount')
     itemcount = spider.crawler.stats.get_value('item_scraped_count')
     self.assertTrue(itemcount >= close_on)
Example #17
 def test_closespider_pagecount(self):
     spider = FollowAllSpider()
     close_on = 5
     yield docrawl(spider, {"CLOSESPIDER_PAGECOUNT": close_on})
     reason = spider.meta["close_reason"]
     self.assertEqual(reason, "closespider_pagecount")
     pagecount = spider.crawler.stats.get_value("response_received_count")
     self.assertTrue(pagecount >= close_on)
Example #18
 def test_https_tunnel_auth_error(self):
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     # the proxy answers 407, but over a CONNECT tunnel the client only sees a tunnel error
     self._assert_got_tunnel_error()
Example #19
 def test_closespider_itemcount(self):
     spider = ItemSpider()
     close_on = 5
     yield docrawl(spider, {"CLOSESPIDER_ITEMCOUNT": close_on})
     reason = spider.meta["close_reason"]
     self.assertEqual(reason, "closespider_itemcount")
     itemcount = spider.crawler.stats.get_value("item_scraped_count")
     self.assertTrue(itemcount >= close_on)
Example #20
 def test_closespider_errorcount(self):
     spider = ErrorSpider(total=1000000)
     close_on = 5
     yield docrawl(spider, {"CLOSESPIDER_ERRORCOUNT": close_on})
     self.flushLoggedErrors(spider.exception_cls)
     reason = spider.meta["close_reason"]
     self.assertEqual(reason, "closespider_errorcount")
     key = "spider_exceptions/{name}".format(name=spider.exception_cls.__name__)
     errorcount = spider.crawler.stats.get_value(key)
     self.assertTrue(errorcount >= close_on)
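
ItemSpider and ErrorSpider, used by the CLOSESPIDER_ITEMCOUNT and CLOSESPIDER_ERRORCOUNT tests, could be thin variations on the FollowAllSpider sketched above; anything not visible in the tests themselves is an assumption:

    import scrapy


    class ItemSpider(FollowAllSpider):
        # same crawl, but also yields one (empty) item per followed link so that
        # CLOSESPIDER_ITEMCOUNT has something to count
        name = 'item'

        def parse(self, response):
            for request in super(ItemSpider, self).parse(response):
                yield request
                yield scrapy.Item()


    class ErrorSpider(FollowAllSpider):
        # raises in every callback so that CLOSESPIDER_ERRORCOUNT kicks in; the
        # test flushes the logged ZeroDivisionError instances afterwards
        name = 'error'
        exception_cls = ZeroDivisionError

        def parse(self, response):
            raise self.exception_cls('expected exception')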
Example #21
 def _test_delay(self, delay, randomize):
     settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
     spider = FollowAllSpider(maxlatency=delay * 2)
     yield docrawl(spider, settings)
     t = spider.times
     totaltime = t[-1] - t[0]
     avgd = totaltime / (len(t) - 1)
     tolerance = 0.6 if randomize else 0.2
     self.assertTrue(avgd > delay * (1 - tolerance),
                     "download delay too small: %s" % avgd)
Example #22
 def _test_delay(self, delay, randomize):
     settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
     spider = FollowAllSpider(maxlatency=delay * 2)
     yield docrawl(spider, settings)
     t = spider.times
     totaltime = t[-1] - t[0]
     avgd = totaltime / (len(t) - 1)
     tolerance = 0.6 if randomize else 0.2
     self.assertTrue(avgd > delay * (1 - tolerance),
                     "download delay too small: %s" % avgd)
Example #23
 def test_closespider_errorcount(self):
     spider = ErrorSpider(total=1000000)
     close_on = 5
     yield docrawl(spider, {'CLOSESPIDER_ERRORCOUNT': close_on})
     self.flushLoggedErrors(spider.exception_cls)
     reason = spider.meta['close_reason']
     self.assertEqual(reason, 'closespider_errorcount')
     key = 'spider_exceptions/{name}'\
             .format(name=spider.exception_cls.__name__)
     errorcount = spider.crawler.stats.get_value(key)
     self.assertTrue(errorcount >= close_on)
Example #24
 def test_closespider_timeout(self):
     spider = FollowAllSpider(total=1000000)
     close_on = 0.1
     yield docrawl(spider, {"CLOSESPIDER_TIMEOUT": close_on})
     reason = spider.meta["close_reason"]
     self.assertEqual(reason, "closespider_timeout")
     stats = spider.crawler.stats
     start = stats.get_value("start_time")
     stop = stats.get_value("finish_time")
     diff = stop - start
      total_seconds = diff.total_seconds()
     self.assertTrue(total_seconds >= close_on)
Example #25
 def test_closespider_timeout(self):
     spider = FollowAllSpider(total=1000000)
     close_on = 0.1
     yield docrawl(spider, {'CLOSESPIDER_TIMEOUT': close_on})
     reason = spider.meta['close_reason']
     self.assertEqual(reason, 'closespider_timeout')
     stats = spider.crawler.stats
     start = stats.get_value('start_time')
     stop = stats.get_value('finish_time')
     diff = stop - start
      total_seconds = diff.total_seconds()
     self.assertTrue(total_seconds >= close_on)
Example #26
    def test_logging(self):
        spider = _HttpErrorSpider(bypass_status_codes={402})
        yield docrawl(spider)
        # print(get_testlog())
        self.assertEqual(spider.parsed, {'200', '402'})
        self.assertEqual(spider.skipped, {'402'})
        self.assertEqual(spider.failed, {'404', '500'})

        log = get_testlog()
        self.assertIn('Ignoring response <404', log)
        self.assertIn('Ignoring response <500', log)
        self.assertNotIn('Ignoring response <200', log)
        self.assertNotIn('Ignoring response <402', log)
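
_HttpErrorSpider, shared by this test and the test_middleware_works examples below, presumably records which status codes were parsed, skipped, and failed. A sketch consistent with the assertions (the URLs and the set-filling logic are assumptions):

    import scrapy
    from scrapy.spidermiddlewares.httperror import HttpError


    class _HttpErrorSpider(scrapy.Spider):
        name = 'httperror'
        start_urls = [
            "http://localhost:8998/status?n=200",
            "http://localhost:8998/status?n=404",
            "http://localhost:8998/status?n=402",
            "http://localhost:8998/status?n=500",
        ]

        def __init__(self, bypass_status_codes=()):
            super(_HttpErrorSpider, self).__init__()
            self.bypass_status_codes = set(bypass_status_codes)
            self.parsed, self.skipped, self.failed = set(), set(), set()

        def start_requests(self):
            for url in self.start_urls:
                yield scrapy.Request(url, self.parse, errback=self.on_error)

        def parse(self, response):
            self.parsed.add(response.url[-3:])  # record the trailing status code

        def on_error(self, failure):
            # HttpErrorMiddleware raises HttpError for non-2xx responses
            if isinstance(failure.value, HttpError):
                response = failure.value.response
                if response.status in self.bypass_status_codes:
                    self.skipped.add(response.url[-3:])
                    return self.parse(response)
                self.failed.add(response.url[-3:])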
Example #27
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(spider.crawler.engine))

        spider = SingleRequestSpider(seed='http://localhost:8998/', callback_func=cb)
        yield docrawl(spider)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)
Example #28
    def test_logging(self):
        spider = _HttpErrorSpider(bypass_status_codes={402})
        yield docrawl(spider)
        # print(get_testlog())
        self.assertEqual(spider.parsed, {'200', '402'})
        self.assertEqual(spider.skipped, {'402'})
        self.assertEqual(spider.failed, {'404', '500'})

        log = get_testlog()
        self.assertIn('Ignoring response <404', log)
        self.assertIn('Ignoring response <500', log)
        self.assertNotIn('Ignoring response <200', log)
        self.assertNotIn('Ignoring response <402', log)
Example #29
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(spider.crawler.engine))

        spider = SingleRequestSpider(seed='http://localhost:8998/', callback_func=cb)
        yield docrawl(spider)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)
Example #30
 def test_start_requests_lazyness(self):
     settings = {"CONCURRENT_REQUESTS": 1}
     spider = BrokenStartRequestsSpider()
     yield docrawl(spider, settings)
Example #31
 def test_follow_all(self):
     spider = FollowAllSpider()
     yield docrawl(spider)
     self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url
Example #32
 def test_https_connect_tunnel(self):
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(200)
Example #33
 def test_https_connect_tunnel(self):
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(200)
Example #34
 def test_https_noconnect_auth_error(self):
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(407)
Example #35
 def test_retry_dns_error(self):
     with mock.patch('socket.gethostbyname',
                     side_effect=socket.gaierror(-5, 'No address associated with hostname')):
         spider = SimpleSpider("http://example.com/")
         yield docrawl(spider)
         self._assert_retried()
Example #36
 def test_retry_503(self):
     spider = SimpleSpider("http://localhost:8998/status?n=503")
     yield docrawl(spider)
     self._assert_retried()
Example #37
 def test_timeout_success(self):
     spider = DelaySpider(n=0.5)
     yield docrawl(spider)
     self.assertTrue(spider.t1 > 0)
     self.assertTrue(spider.t2 > 0)
     self.assertTrue(spider.t2 > spider.t1)
Example #38
 def test_middleware_works(self):
     spider = _HttpErrorSpider()
     yield docrawl(spider)
     assert not spider.skipped, spider.skipped
     self.assertEqual(spider.parsed, {'200'})
     self.assertEqual(spider.failed, {'404', '402', '500'})
Example #39
 def test_middleware_works(self):
     spider = _HttpErrorSpider()
     yield docrawl(spider)
     assert not spider.skipped, spider.skipped
     self.assertEqual(spider.parsed, {'200'})
     self.assertEqual(spider.failed, {'404', '402', '500'})
Example #40
 def test_follow_all(self):
     spider = FollowAllSpider()
     yield docrawl(spider)
     self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url
Example #41
 def test_retry_conn_failed(self):
     spider = SimpleSpider("http://localhost:65432/status?n=503")
     yield docrawl(spider)
     self._assert_retried()
Example #42
 def test_retry_503(self):
     spider = SimpleSpider("http://localhost:8998/status?n=503")
     yield docrawl(spider)
     self._assert_retried()
Example #43
 def test_https_connect_tunnel_error(self):
     spider = SimpleSpider("https://localhost:99999/status?n=200")
     yield docrawl(spider)
     self._assert_got_tunnel_error()
Example #44
 def test_start_requests_lazyness(self):
     settings = {"CONCURRENT_REQUESTS": 1}
     spider = BrokenStartRequestsSpider()
     yield docrawl(spider, settings)
Example #45
 def test_retry_conn_failed(self):
     spider = SimpleSpider("http://localhost:65432/status?n=503")
     yield docrawl(spider)
     self._assert_retried()
Example #46
 def test_retry_conn_aborted(self):
     # connection lost before receiving data
     spider = SimpleSpider("http://localhost:8998/drop?abort=1")
     yield docrawl(spider)
     self._assert_retried()
Example #47
 def test_start_requests_bug_before_yield(self):
     spider = BrokenStartRequestsSpider(fail_before_yield=1)
     yield docrawl(spider)
     errors = self.flushLoggedErrors(ZeroDivisionError)
     self.assertEqual(len(errors), 1)
Example #48
 def test_retry_conn_aborted(self):
     # connection lost before receiving data
     spider = SimpleSpider("http://localhost:8998/drop?abort=1")
     yield docrawl(spider)
     self._assert_retried()
Example #49
 def test_https_connect_tunnel_error(self):
     spider = SimpleSpider("https://localhost:99999/status?n=200")
     yield docrawl(spider)
     self._assert_got_tunnel_error()
Example #50
 def test_https_noconnect_auth_error(self):
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
     spider = SimpleSpider("https://localhost:8999/status?n=200")
     yield docrawl(spider)
     self._assert_got_response_code(407)
Example #51
 def test_start_requests_bug_before_yield(self):
     spider = BrokenStartRequestsSpider(fail_before_yield=1)
     yield docrawl(spider)
     errors = self.flushLoggedErrors(ZeroDivisionError)
     self.assertEqual(len(errors), 1)
Example #52
 def test_timeout_success(self):
     spider = DelaySpider(n=0.5)
     yield docrawl(spider)
     self.assertTrue(spider.t1 > 0)
     self.assertTrue(spider.t2 > 0)
     self.assertTrue(spider.t2 > spider.t1)