class LogStats(object):
    '''Periodically log the number of downloaded responses and the current
    download rate.

    The reporting period is taken from the LOG_STATS_INTERVAL setting. When
    that setting is empty or zero, `NotConfigured` is raised.
    '''

    def __init__(self, engine, clock=None):
        self.interval = engine.settings.get_float('LOG_STATS_INTERVAL')
        if not self.interval:
            raise NotConfigured

        # factor converting "responses per interval" to "responses per minute"
        self.multiplier = 60.0 / self.interval
        self.logging = LoopingCall(self.log, clock=clock)

        # subscribe the handlers to the engine signals they react to
        subscriptions = (
            (self.engine_started, signals.engine_started),
            (self.engine_stopped, signals.engine_stopped),
            (self.response_downloaded, signals.response_downloaded),
        )
        for handler, signal in subscriptions:
            engine.signals.connect(handler, signal=signal)

        # running total, and the total seen at the time of the previous report
        self.downloaded = 0
        self.downloaded_prev = 0

    def engine_started(self):
        '''Start the periodic reporting.'''
        self.logging.schedule(self.interval)

    def engine_stopped(self):
        '''Stop the periodic reporting.'''
        self.logging.cancel()

    def response_downloaded(self):
        '''Count one more downloaded response.'''
        self.downloaded += 1

    def log(self):
        '''Report the total count and the rate since the previous report.'''
        total = self.downloaded
        pages_per_minute = (total - self.downloaded_prev) * self.multiplier
        self.downloaded_prev = total
        # inside this method, `log` is the module-level logging module
        log.msg(format='Crawled %(down)d pages (at %(speed)d pages/min).',
                level=log.INFO, down=total, speed=pages_per_minute)
def test_init(self):
    # A LoopingCall created without an explicit clock must still accept
    # being scheduled and cancelled.
    looping_call = LoopingCall(
        self.obj.func, *self.default_args, **self.default_kwargs)
    looping_call.schedule()
    looping_call.cancel()
class LoopingCallTest(unittest.TestCase):
    '''Exercise LoopingCall scheduling against a manually advanced clock.'''

    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        clock = Clock()
        target = ModifiedObject()
        self.clock = clock
        self.obj = target
        self.sc = LoopingCall(target.func, clock=clock,
                              *self.default_args, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Compare the arguments recorded by the target object with the
        # expected ones; None means "nothing recorded".
        recorded = self.obj
        if args is not None:
            self.assertTupleEqual(recorded.args, args)
        else:
            self.assertIsNone(recorded.args)
        if kwargs is not None:
            self.assertEqual(recorded.kwargs, kwargs)
        else:
            self.assertIsNone(recorded.kwargs)

    def _expect_calls(self, expected):
        # The target object must have been called exactly `expected` times.
        self.assertEqual(self.obj.num_calls, expected)

    def test_init(self):
        # A LoopingCall created without an explicit clock must still accept
        # being scheduled and cancelled.
        looping_call = LoopingCall(
            self.obj.func, *self.default_args, **self.default_kwargs)
        looping_call.schedule()
        looping_call.cancel()

    def test_basic(self):
        sc, clock = self.sc, self.clock

        # nothing is scheduled until schedule() is called
        self.assertFalse(sc.is_scheduled())
        sc.schedule(2, count=2, now=False)

        # scheduled, but the first call has not happened yet
        self.assertTrue(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 2)
        self.assertEqual(sc.calls_left(), 2)
        clock.advance(1)
        self._expect_calls(0)

        # first call
        clock.advance(1)
        self._expect_calls(1)
        self.assertEqual(sc.calls_left(), 1)
        self._check(self.default_args, self.default_kwargs)

        # second (and last) call
        clock.advance(2)
        self._expect_calls(2)
        self.assertEqual(sc.calls_left(), 0)
        self.assertFalse(sc.is_scheduled())

        # the count is exhausted, nothing else is called
        clock.advance(20)
        self._expect_calls(2)

    def test_now(self):
        clock = self.clock
        self.sc.schedule(2, count=2, now=True)
        # even with now=True the call is not made synchronously
        self._expect_calls(0)
        clock.advance(0)
        self._expect_calls(1)
        clock.advance(20)
        self._expect_calls(2)

    def test_infinite(self):
        sc = self.sc
        sc.schedule(2)
        self.clock.pump([2] * 100)
        self._expect_calls(100)
        # without a count the call stays scheduled indefinitely
        self.assertTrue(sc.is_scheduled())
        self.assertIsNone(sc.calls_left())

    def test_cancel(self):
        clock = self.clock
        self.sc.schedule(2)
        clock.advance(1)
        self.sc.cancel()
        clock.advance(20)
        self._expect_calls(0)

    def test_reschedule(self):
        sc, clock = self.sc, self.clock
        sc.schedule(2)
        clock.advance(1)
        # rescheduling replaces the pending call and restarts the timer
        sc.schedule(5)
        clock.advance(4)
        self._expect_calls(0)
        clock.advance(1)
        self._expect_calls(1)

    def test_no_delay(self):
        clock = self.clock
        self.sc.schedule()
        self._expect_calls(0)
        clock.advance(0)
        self._expect_calls(1)
        clock.advance(0)
        self._expect_calls(2)

    def test_nested_schedule(self):
        sc, clock = self.sc, self.clock

        def reschedule_from_inside(*args, **kwargs):
            # call the target, then schedule again from within the call
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        sc.func = reschedule_from_inside
        sc.schedule()
        self._expect_calls(0)
        clock.advance(0)
        self._expect_calls(1)
        clock.advance(0)
        self._expect_calls(2)
class Downloader(object):
    '''Fetch requests from `request_queue` queue. When downloaded, put the
    results into `response_queue` queue.

    Respect CONCURRENT_REQUESTS setting. Requests are further divided into
    specific slots, based on their domains.
    '''

    # how many seconds to wait between the checks of request_queue
    QUEUE_CHECK_FREQUENCY = 0.1

    def __init__(self, settings, request_queue, response_queue,
                 download_handler=None, clock=None):
        '''Set up the downloader and start polling `request_queue`.

        `settings` provides DOWNLOAD_DELAY, RANDOMIZE_DOWNLOAD_DELAY,
        CONCURRENT_REQUESTS and CONCURRENT_REQUESTS_PER_DOMAIN.
        `download_handler` defaults to `GeneralHandler(settings)` and `clock`
        defaults to the reactor.
        '''
        self.request_queue = request_queue
        self.response_queue = response_queue  # queue of responses
        self.download_handler = download_handler or GeneralHandler(settings)
        self.slots = {}
        self.num_in_progress = 0
        self.clock = clock or reactor
        self.running = True
        self._configure_concurrency(settings)

        # start polling only once every attribute used by process() exists
        self.processing = LoopingCall(self.process, clock=self.clock)
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY, now=True)

    def _configure_concurrency(self, settings):
        '''Read the download delay and the concurrency limits from `settings`.'''
        # read as a float, so that fractional delays are not lost to an
        # integer conversion
        self.download_delay = settings.get_float('DOWNLOAD_DELAY')
        self.randomize_delay = settings.get_int('RANDOMIZE_DOWNLOAD_DELAY')

        if self.download_delay:
            # with a delay configured, only one request is processed at a
            # time and all requests share a single slot
            self.total_concurrency = self.domain_concurrency = 1
            self.use_domain_specific = False
            return

        self.total_concurrency = settings.get_int('CONCURRENT_REQUESTS')
        self.domain_concurrency = settings.get_int(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        if (not self.domain_concurrency or
                self.domain_concurrency >= self.total_concurrency):
            # the per-domain limit is unset or not stricter than the global
            # one, so a single shared slot is enough
            self.use_domain_specific = False
            self.domain_concurrency = self.total_concurrency
        else:
            self.use_domain_specific = True

    def close(self):
        '''Stop polling `request_queue` and stop publishing results.'''
        self.processing.cancel()
        self.running = False

    @property
    def free_slots(self):
        '''Number of requests that may still be started right now.'''
        return self.total_concurrency - self.num_in_progress

    def is_idle(self):
        '''Return True when no request is being downloaded.'''
        return self.num_in_progress == 0

    def process(self):
        '''Move requests from `request_queue` into their slots, for as long
        as the downloader is running, `response_queue` does not need to back
        out, requests are available and free slots remain.
        '''
        while (self.running and not self.response_queue.needs_backout() and
               self.request_queue and self.free_slots > 0):
            request = self.request_queue.pop()
            _, slot = self._get_slot(request)

            self.num_in_progress += 1
            dfd = defer.Deferred().addBoth(self._remove_in_progress)
            dfd.addBoth(partial(self._enqueue_result, request))
            slot.enqueue(request, dfd)

    def _remove_in_progress(self, response):
        '''Release the slot taken by a finished download; pass `response` on.'''
        self.num_in_progress -= 1
        self._clear_slots()  # clear empty slots
        return response

    def _enqueue_result(self, request, result):
        '''Attach `request` to `result` and push it into `response_queue`.'''
        # in a case, result is actually a Failure
        result.request = request
        # make sure not to modify response_queue, after stopping the
        # downloader
        if self.running:
            self.response_queue.push(result)
        # don't return anything from here, in a case an error occurred -
        # we don't want it to be logged

    def _get_slot(self, request):
        '''Return `(key, slot)` for `request`, creating the slot if needed.

        The key is the hostname of the request when domain specific slots
        are used, otherwise an empty string shared by all requests.
        '''
        key = request.parsed_url.hostname if self.use_domain_specific else ''
        if key not in self.slots:
            self.slots[key] = Slot(
                self.download_handler,
                self.domain_concurrency,
                self.download_delay,
                self.randomize_delay,
                clock=self.clock)
        return key, self.slots[key]

    def _clear_slots(self):
        '''Clear unused slots and avoid memory leaking.'''
        if len(self.slots) >= 2 * self.total_concurrency:
            # items() works on both Python 2 and 3; the keys are collected
            # first, so the dict is never modified while being iterated
            to_delete = [k for (k, v) in self.slots.items() if v.is_idle()]
            for key in to_delete:
                del self.slots[key]
class Downloader(object):
    '''Fetch requests from `request_queue` queue. When downloaded, put the
    results into `response_queue` queue.

    Respect CONCURRENT_REQUESTS setting. Requests are further divided into
    specific slots, based on their domains.
    '''

    # how many seconds to wait between the checks of request_queue
    QUEUE_CHECK_FREQUENCY = 0.1

    def __init__(self, settings, request_queue, response_queue,
                 download_handler=None, clock=None):
        '''Set up the downloader and start polling `request_queue`.

        `settings` provides DOWNLOAD_DELAY, RANDOMIZE_DOWNLOAD_DELAY,
        CONCURRENT_REQUESTS and CONCURRENT_REQUESTS_PER_DOMAIN.
        `download_handler` defaults to `GeneralHandler(settings)` and `clock`
        defaults to the reactor.
        '''
        self.request_queue = request_queue
        self.response_queue = response_queue  # queue of responses
        self.download_handler = download_handler or GeneralHandler(settings)
        self.slots = {}
        self.num_in_progress = 0
        self.clock = clock or reactor
        self.running = True
        self._configure_concurrency(settings)

        # start polling only once every attribute used by process() exists
        self.processing = LoopingCall(self.process, clock=self.clock)
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY, now=True)

    def _configure_concurrency(self, settings):
        '''Read the download delay and the concurrency limits from `settings`.'''
        # the delay is a float, so fractional delays are supported
        self.download_delay = settings.get_float('DOWNLOAD_DELAY')
        self.randomize_delay = settings.get_int('RANDOMIZE_DOWNLOAD_DELAY')

        if self.download_delay:
            # with a delay configured, only one request is processed at a
            # time and all requests share a single slot
            self.total_concurrency = self.domain_concurrency = 1
            self.use_domain_specific = False
            return

        self.total_concurrency = settings.get_int('CONCURRENT_REQUESTS')
        self.domain_concurrency = settings.get_int(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        if (not self.domain_concurrency or
                self.domain_concurrency >= self.total_concurrency):
            # the per-domain limit is unset or not stricter than the global
            # one, so a single shared slot is enough
            self.use_domain_specific = False
            self.domain_concurrency = self.total_concurrency
        else:
            self.use_domain_specific = True

    def close(self):
        '''Stop polling `request_queue` and stop publishing results.'''
        self.processing.cancel()
        self.running = False

    @property
    def free_slots(self):
        '''Number of requests that may still be started right now.'''
        return self.total_concurrency - self.num_in_progress

    def is_idle(self):
        '''Return True when no request is being downloaded.'''
        return self.num_in_progress == 0

    def process(self):
        '''Move requests from `request_queue` into their slots, for as long
        as the downloader is running, `response_queue` does not need to back
        out, requests are available and free slots remain.
        '''
        while (self.running and not self.response_queue.needs_backout() and
               self.request_queue and self.free_slots > 0):
            request = self.request_queue.pop()
            _, slot = self._get_slot(request)

            self.num_in_progress += 1
            dfd = defer.Deferred().addBoth(self._remove_in_progress)
            dfd.addBoth(partial(self._enqueue_result, request))
            slot.enqueue(request, dfd)

    def _remove_in_progress(self, response):
        '''Release the slot taken by a finished download; pass `response` on.'''
        self.num_in_progress -= 1
        self._clear_slots()  # clear empty slots
        return response

    def _enqueue_result(self, request, result):
        '''Attach `request` to `result` and push it into `response_queue`.'''
        # in a case, result is actually a Failure
        result.request = request
        # make sure not to modify response_queue, after stopping the
        # downloader
        if self.running:
            self.response_queue.push(result)
        # don't return anything from here, in a case an error occurred -
        # we don't want it to be logged

    def _get_slot(self, request):
        '''Return `(key, slot)` for `request`, creating the slot if needed.

        The key is the hostname of the request when domain specific slots
        are used, otherwise an empty string shared by all requests.
        '''
        key = request.parsed_url.hostname if self.use_domain_specific else ''
        if key not in self.slots:
            self.slots[key] = Slot(
                self.download_handler,
                self.domain_concurrency,
                self.download_delay,
                self.randomize_delay,
                clock=self.clock)
        return key, self.slots[key]

    def _clear_slots(self):
        '''Clear unused slots and avoid memory leaking.'''
        if len(self.slots) >= 2 * self.total_concurrency:
            # items() works on both Python 2 and 3; the keys are collected
            # first, so the dict is never modified while being iterated
            to_delete = [k for (k, v) in self.slots.items() if v.is_idle()]
            for key in to_delete:
                del self.slots[key]