def setUp(self):
    '''Create a downloader wired to fresh queues, a mock download handler
    and a manually driven clock.
    '''
    self.clock = Clock()
    self.request_queue = MemoryQueue()
    self.response_queue = ResponseQueue()

    downloader_settings = Settings(self.default_settings)
    mock_handler = MockDownloaderHandler(Settings())
    self.dwn = Downloader(
        downloader_settings,
        self.request_queue,
        self.response_queue,
        download_handler=mock_handler,
        clock=self.clock,
    )
    # keep a shortcut to the handler actually stored on the downloader
    self.handler = self.dwn.download_handler
def __init__(self, download_handler, concurrency, delay, randomize_delay,
             clock=None):
    '''Store the slot configuration and create its empty bookkeeping state.

    `clock` is optional; when it is not given, the reactor is used instead.
    '''
    # configuration
    self.download_handler = download_handler
    self.concurrency = concurrency
    self.delay = delay
    self.randomize_delay = randomize_delay

    # bookkeeping
    self.in_progress = set()  # request waiting to be downloaded
    # requests being downloaded (subset of `in_progress`)
    self.transferring = set()
    self.last_download_time = 0
    self.queue = MemoryQueue()  # queue of (request, deferred)

    # clock is used in unittests
    self.clock = clock if clock else reactor
    self.delayed_processing = ScheduledCall(self._process, clock=self.clock)
def setUp(self):
    '''Prepare a clock, both queues and a downloader that uses a mock
    download handler.
    '''
    self.clock = Clock()
    self.request_queue = MemoryQueue()
    self.response_queue = ResponseQueue()

    settings = Settings(self.default_settings)
    handler = MockDownloaderHandler(Settings())
    self.dwn = Downloader(settings, self.request_queue,
                          self.response_queue,
                          download_handler=handler, clock=self.clock)
    # expose the handler held by the downloader to the tests
    self.handler = self.dwn.download_handler
def setup(self):
    '''Create all engine components and hand the engine over to the spider.

    Initialized, in this sequence: logging (only when LOG_ENABLED is set),
    signals, stats, the request/response queues with the downloader,
    extensions and the pipeline. `self.initialized` is set to True before
    the spider receives the engine.
    '''
    assert self.spider is not None, 'Spider is not set in Engine.'

    # IMPORTANT: the sequence of the initializations below is significant,
    # so think twice before changing it.

    # logging
    if self.settings.get_bool('LOG_ENABLED'):
        log.start(
            self.settings['LOG_FILE'],
            self.settings['LOG_LEVEL'],
            self.settings['LOG_STDOUT'],
            self.settings['LOG_ENCODING'],
        )

    # signals
    self.signals = SignalManager(self)

    # stats
    self.stats = load_object(self.settings.get('STATS_CLASS'))(self)

    # downloader and its queues
    self.request_queue = PriorityQueue(lambda _: MemoryQueue())
    active_size_limit = self.settings.get_int('RESPONSE_ACTIVE_SIZE_LIMIT')
    self.response_queue = ResponseQueue(active_size_limit)
    self.downloader = Downloader(
        self.settings,
        self.request_queue,
        self.response_queue,
        clock=self.clock,
    )

    # extensions
    self.extensions = ExtensionManager(self)

    # downloader pipeline
    self.pipeline = PipelineManager(self)

    self.initialized = True

    # everything is in place, so the spider may now get its engine
    self.spider.set_engine(self)
class Slot(object):
    '''Queue of requests belonging to one particular domain.

    Downloads are started while honouring both DOWNLOAD_DELAY and
    CONCURRENT_REQUESTS_PER_DOMAIN.
    '''

    def __init__(self, download_handler, concurrency, delay, randomize_delay,
                 clock=None):
        '''Remember the configuration and start with empty bookkeeping.

        When no `clock` is passed, the reactor is used.
        '''
        self.download_handler = download_handler
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay

        self.in_progress = set()  # request waiting to be downloaded
        # requests being downloaded (subset of `in_progress`)
        self.transferring = set()
        self.last_download_time = 0
        self.queue = MemoryQueue()  # queue of (request, deferred)

        # clock is used in unittests
        self.clock = clock or reactor
        self.delayed_processing = ScheduledCall(self._process,
                                                clock=self.clock)

    def enqueue(self, request, dfd):
        '''Main entry point: queue `request` together with its deferred and
        try to start downloading right away.
        '''
        def _forget_request(result):
            # runs once `dfd` fires, whatever the outcome
            self.in_progress.remove(request)
            return result

        # the request must be registered before the callback is attached
        self.in_progress.add(request)
        dfd.addBoth(_forget_request)
        self.queue.push((request, dfd))
        self._process()

    @property
    def free_slots(self):
        '''Number of downloads that may still be started.'''
        busy = len(self.transferring)
        return self.concurrency - busy

    def is_idle(self):
        '''Return True when no request is in progress.'''
        return not self.in_progress

    def _process(self):
        '''Start queued downloads for as long as the delay and the
        concurrency limit permit.
        '''
        if self.delayed_processing.is_scheduled():
            return
        if self._schedule_delay():
            return

        while self.queue and self.free_slots > 0:
            self.last_download_time = self.clock.seconds()
            request, result_dfd = self.queue.pop()
            self._download(request).chainDeferred(result_dfd)
            if self._schedule_delay():
                break

    def _schedule_delay(self):
        '''Schedule a delayed `_process()` call when the download delay has
        not elapsed yet. Return True if a call was scheduled.
        '''
        if not self.delay:
            return False

        ready_at = self.last_download_time + self.get_download_delay()
        remaining = ready_at - self.clock.seconds()
        if remaining > 0:
            # scheduling should always succeed here, because
            # `_schedule_delay()` is only called from within `_process()`
            self.delayed_processing.schedule(remaining)
            return True
        return False

    def _download(self, request):
        '''Pass `request` to the download handler and return the deferred
        of that download.
        '''
        def _rewrap(failure):
            # It is VERY important to wrap the failure into a new object!
            # For errors like ConnectionLost the same Failure object is
            # returned every time, so the 'failure.request' field cannot
            # be used.
            return Failure(failure.value)

        def _release(result):
            # the download is over: unblock the requests waiting behind it
            self.transferring.remove(request)
            self._process()
            return result

        dfd = defer.succeed(request)
        # download the response
        dfd.addCallback(self.download_handler.download_request)
        dfd.addErrback(_rewrap)
        # keep the request in `transferring` to block other requests until
        # its response is downloaded
        self.transferring.add(request)
        dfd.addBoth(_release)
        return dfd

    def get_download_delay(self):
        '''Return the delay to apply; when randomization is enabled it is
        drawn uniformly from 0.5 * delay to 1.5 * delay.
        '''
        if not self.randomize_delay:
            return self.delay
        base = self.delay
        return random.uniform(0.5 * base, 1.5 * base)
class DownloaderTest(unittest.TestCase):
    '''Tests of `Downloader`, driven by a manual clock and a mock download
    handler.
    '''

    default_settings = {
        'CONCURRENT_REQUESTS': 2,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 0,
        'RANDOMIZE_DOWNLOAD_DELAY': False}

    def setUp(self):
        self.clock = Clock()
        self.request_queue = MemoryQueue()
        self.response_queue = ResponseQueue()
        self._build_downloader(self.default_settings)

    def _build_downloader(self, settings_dict):
        '''Create `self.dwn` from `settings_dict`, reusing the test's queues
        and clock, and store its download handler in `self.handler`.
        '''
        self.dwn = Downloader(Settings(settings_dict), self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def _update_dwn(self, **kwargs):
        '''Update downloader with the new settings.

        `kwargs` override the entries of `default_settings`. The processing
        call of the previous downloader is cancelled before the new
        downloader is created.
        '''
        new_settings = self.default_settings.copy()
        new_settings.update(**kwargs)
        self.dwn.processing.cancel()
        self._build_downloader(new_settings)

    def test_concurrency(self):
        '''Check the concurrency attributes derived from the settings.'''
        # standard situation
        self._update_dwn()
        self.assertEqual(self.dwn.total_concurrency, 2)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertTrue(self.dwn.use_domain_specific)

        # delay set
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5,
                         DOWNLOAD_DELAY=5)
        self.assertEqual(self.dwn.total_concurrency, 1)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertFalse(self.dwn.use_domain_specific)

        # domain concurrency is 0
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=0)
        self.assertEqual(self.dwn.total_concurrency, 10)
        self.assertEqual(self.dwn.domain_concurrency, 10)
        self.assertFalse(self.dwn.use_domain_specific)

        # domain concurrency is too big
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=10)
        self.assertEqual(self.dwn.total_concurrency, 5)
        self.assertEqual(self.dwn.domain_concurrency, 5)
        self.assertFalse(self.dwn.use_domain_specific)

        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5)
        self.assertFalse(self.dwn.use_domain_specific)

    def test_get_slot(self):
        '''Check which slot key and slot object `_get_slot` returns.'''
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, 'www.github.com')

        key2, slot2 = self.dwn._get_slot(
            Request('http://www.github.com/hello/world#bla'))
        self.assertEqual(key2, 'www.github.com')
        self.assertIs(slot2, slot)

        key3, slot3 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertEqual(key3, 'sites.github.com')
        self.assertIsNot(slot3, slot)

        self.assertEqual(len(self.dwn.slots), 2)

        # don't use domain specific slots
        self.dwn.use_domain_specific = False
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, '')
        key2, slot2 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertIs(slot2, slot)

    def test_basic(self):
        '''Walk five requests through the downloader step by step.'''
        # create 5 requests with slot ids: a, b, a, a, c
        requests = [get_request(slot_id)[0] for slot_id in 'abaac']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)
        self.assertEqual(self.dwn.free_slots, 2)
        self.assertTrue(self.dwn.is_idle())

        # start downloading first two requests
        self.clock.advance(0)
        self.assertEqual(self.dwn.free_slots, 0)
        self.assertFalse(self.dwn.is_idle())

        # no more requests are scheduled, until download is finished
        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 3)

        # download the first request
        self.handler.call(requests[0], Response('hello'))
        self.assertEqual(self.dwn.free_slots, 1)  # slot is immediately available
        # result is also available
        result = self.response_queue.peek()
        self.assertIs(result.request, requests[0])
        self.assertEqual(result.url, 'hello')

        # enqueue third request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)

        # download second request
        self.handler.call(requests[1], Response(''))

        # enqueue fourth request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # fourth request should not begin download, until 3rd request is done
        self.assertRaises(KeyError, self.handler.call, requests[3],
                          Response(''))

        # finish
        self.handler.call(requests[2], Response(''))
        self.handler.call(requests[3], Response(''))
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.handler.call(requests[4], Response(''))

        # final checks
        self.clock.pump([1] * 10)
        self.assertEqual(len(self.response_queue), 5)
        self.assertTrue(self.dwn.is_idle())

    def test_close(self):
        '''Check the downloader state and behavior after `close()`.'''
        req1 = get_request('a')[0]
        req2 = get_request('b')[0]
        self.request_queue.push(req1)
        self.clock.advance(20)
        self.request_queue.push(req2)

        # test basic attributes, before and after closing
        self.assertTrue(self.dwn.running)
        self.assertTrue(self.dwn.processing.is_scheduled())
        self.dwn.close()
        self.assertFalse(self.dwn.running)
        self.assertFalse(self.dwn.processing.is_scheduled())

        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 1)  # request 2 remains unqueued

        # downloader behavior after closing
        self.assertEqual(len(self.response_queue), 0)
        self.handler.call(req1, Response(''))
        self.assertEqual(len(self.response_queue), 0)

    def test_fail(self):
        '''Check that failures reach the response queue with their request
        and the original error attached.
        '''
        self._update_dwn(CONCURRENT_REQUESTS=3,
                         CONCURRENT_REQUESTS_PER_DOMAIN=2)
        requests = [get_request(slot_id)[0] for slot_id in 'aab']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)

        # enqueue requests
        self.clock.advance(0)

        # fail 1st request
        err = ValueError('my bad')
        self.handler.fail(requests[0], err)
        self.assertEqual(self.dwn.free_slots, 1)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[0])
        self.assertIs(fail.value, err)

        # fail 3rd request
        self.handler.fail(requests[2], err)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[2])
        self.assertIs(fail.value, err)

        # succeed 2nd request
        self.handler.call(requests[1], Response('nice!', request=requests[1]))
        resp = self.response_queue.pop()
        self.assertIs(resp.request, requests[1])
        self.assertEqual(resp.url, 'nice!')

    def test_clear_slots(self):
        '''Check that the number of slots stays bounded after 30 requests
        with distinct ids were downloaded one by one.
        '''
        # `range` instead of `xrange`, which does not exist on Python 3
        requests = [get_request(slot_id)[0] for slot_id in range(30)]
        for r in requests:
            self.request_queue.push(r)
            self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
            self.handler.call(r, Response(''))
        self.assertLessEqual(len(self.dwn.slots),
                             2 * self.dwn.total_concurrency)
class DownloaderTest(unittest.TestCase):
    '''Tests of `Downloader`, driven by a manual clock and a mock download
    handler.
    '''

    default_settings = {
        'CONCURRENT_REQUESTS': 2,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 0,
        'RANDOMIZE_DOWNLOAD_DELAY': False
    }

    def setUp(self):
        self.clock = Clock()
        self.request_queue = MemoryQueue()
        self.response_queue = ResponseQueue()
        self._build_downloader(self.default_settings)

    def _build_downloader(self, settings_dict):
        '''Create `self.dwn` from `settings_dict`, reusing the test's queues
        and clock, and store its download handler in `self.handler`.
        '''
        self.dwn = Downloader(Settings(settings_dict), self.request_queue,
                              self.response_queue,
                              download_handler=MockDownloaderHandler(
                                  Settings()),
                              clock=self.clock)
        self.handler = self.dwn.download_handler

    def _update_dwn(self, **kwargs):
        '''Update downloader with the new settings.

        `kwargs` override the entries of `default_settings`. The processing
        call of the previous downloader is cancelled before the new
        downloader is created.
        '''
        new_settings = self.default_settings.copy()
        new_settings.update(**kwargs)
        self.dwn.processing.cancel()
        self._build_downloader(new_settings)

    def test_concurrency(self):
        '''Check the delay and concurrency attributes derived from the
        settings.
        '''
        # standard situation
        self._update_dwn()
        self.assertEqual(self.dwn.total_concurrency, 2)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertTrue(self.dwn.use_domain_specific)

        # delay set
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5,
                         DOWNLOAD_DELAY=3.14)
        self.assertEqual(self.dwn.download_delay, 3.14)
        self.assertEqual(self.dwn.total_concurrency, 1)
        self.assertEqual(self.dwn.domain_concurrency, 1)
        self.assertFalse(self.dwn.use_domain_specific)

        # domain concurrency is 0
        self._update_dwn(CONCURRENT_REQUESTS=10,
                         CONCURRENT_REQUESTS_PER_DOMAIN=0)
        self.assertEqual(self.dwn.total_concurrency, 10)
        self.assertEqual(self.dwn.domain_concurrency, 10)
        self.assertFalse(self.dwn.use_domain_specific)

        # domain concurrency is too big
        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=10)
        self.assertEqual(self.dwn.total_concurrency, 5)
        self.assertEqual(self.dwn.domain_concurrency, 5)
        self.assertFalse(self.dwn.use_domain_specific)

        self._update_dwn(CONCURRENT_REQUESTS=5,
                         CONCURRENT_REQUESTS_PER_DOMAIN=5)
        self.assertFalse(self.dwn.use_domain_specific)

    def test_get_slot(self):
        '''Check which slot key and slot object `_get_slot` returns.'''
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, 'www.github.com')

        key2, slot2 = self.dwn._get_slot(
            Request('http://www.github.com/hello/world#bla'))
        self.assertEqual(key2, 'www.github.com')
        self.assertIs(slot2, slot)

        key3, slot3 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertEqual(key3, 'sites.github.com')
        self.assertIsNot(slot3, slot)

        self.assertEqual(len(self.dwn.slots), 2)

        # don't use domain specific slots
        self.dwn.use_domain_specific = False
        key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
        self.assertEqual(key, '')
        key2, slot2 = self.dwn._get_slot(Request('http://sites.github.com/'))
        self.assertIs(slot2, slot)

    def test_basic(self):
        '''Walk five requests through the downloader step by step.'''
        # create 5 requests with slot ids: a, b, a, a, c
        requests = [get_request(slot_id)[0] for slot_id in 'abaac']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)
        self.assertEqual(self.dwn.free_slots, 2)
        self.assertTrue(self.dwn.is_idle())

        # start downloading first two requests
        self.clock.advance(0)
        self.assertEqual(self.dwn.free_slots, 0)
        self.assertFalse(self.dwn.is_idle())

        # no more requests are scheduled, until download is finished
        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 3)

        # download the first request
        self.handler.call(requests[0], Response('hello'))
        self.assertEqual(self.dwn.free_slots, 1)  # slot is immediately available
        # result is also available
        result = self.response_queue.peek()
        self.assertIs(result.request, requests[0])
        self.assertEqual(result.url, 'hello')

        # enqueue third request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)

        # download second request
        self.handler.call(requests[1], Response(''))

        # enqueue fourth request
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.assertEqual(self.dwn.free_slots, 0)
        # fourth request should not begin download, until 3rd request is done
        self.assertRaises(KeyError, self.handler.call, requests[3],
                          Response(''))

        # finish
        self.handler.call(requests[2], Response(''))
        self.handler.call(requests[3], Response(''))
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.handler.call(requests[4], Response(''))

        # final checks
        self.clock.pump([1] * 10)
        self.assertEqual(len(self.response_queue), 5)
        self.assertTrue(self.dwn.is_idle())

    def test_close(self):
        '''Check the downloader state and behavior after `close()`.'''
        req1 = get_request('a')[0]
        req2 = get_request('b')[0]
        self.request_queue.push(req1)
        self.clock.advance(20)
        self.request_queue.push(req2)

        # test basic attributes, before and after closing
        self.assertTrue(self.dwn.running)
        self.assertTrue(self.dwn.processing.is_scheduled())
        self.dwn.close()
        self.assertFalse(self.dwn.running)
        self.assertFalse(self.dwn.processing.is_scheduled())

        self.clock.advance(20)
        self.assertEqual(len(self.request_queue), 1)  # request 2 remains unqueued

        # downloader behavior after closing
        self.assertEqual(len(self.response_queue), 0)
        self.handler.call(req1, Response(''))
        self.assertEqual(len(self.response_queue), 0)

    def test_fail(self):
        '''Check that failures reach the response queue with their request
        and the original error attached.
        '''
        self._update_dwn(CONCURRENT_REQUESTS=3,
                         CONCURRENT_REQUESTS_PER_DOMAIN=2)
        requests = [get_request(slot_id)[0] for slot_id in 'aab']
        # explicit loop: `map()` is lazy on Python 3 and would push nothing
        for request in requests:
            self.request_queue.push(request)

        # enqueue requests
        self.clock.advance(0)

        # fail 1st request
        err = ValueError('my bad')
        self.handler.fail(requests[0], err)
        self.assertEqual(self.dwn.free_slots, 1)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[0])
        self.assertIs(fail.value, err)

        # fail 3rd request
        self.handler.fail(requests[2], err)
        fail = self.response_queue.pop()
        self.assertIs(fail.request, requests[2])
        self.assertIs(fail.value, err)

        # succeed 2nd request
        self.handler.call(requests[1], Response('nice!', request=requests[1]))
        resp = self.response_queue.pop()
        self.assertIs(resp.request, requests[1])
        self.assertEqual(resp.url, 'nice!')

    def test_clear_slots(self):
        '''Check that the number of slots stays bounded after 30 requests
        with distinct ids were downloaded one by one.
        '''
        # `range` instead of `xrange`, which does not exist on Python 3
        requests = [get_request(slot_id)[0] for slot_id in range(30)]
        for r in requests:
            self.request_queue.push(r)
            self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
            self.handler.call(r, Response(''))
        self.assertLessEqual(len(self.dwn.slots),
                             2 * self.dwn.total_concurrency)
def qfactory(priority):
    '''Return a new, empty MemoryQueue; `priority` is accepted but not used.
    '''
    queue = MemoryQueue()
    return queue