class ScheduledCallTest(unittest.TestCase):
    '''Tests of ScheduledCall driven by a manually advanced Clock.

    `self.obj` is a ModifiedObject whose `func` is the scheduled target; the
    tests inspect its `args`, `kwargs` and `num_calls` attributes to find out
    whether, and with which arguments, the call was made.
    '''

    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        self.sc = ScheduledCall(self.obj.func, clock=self.clock,
                                *self.default_args, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Compare the values recorded on `self.obj` with the expected ones.
        # Passing None asserts that the recorded value is still None.
        if args is not None:
            self.assertTupleEqual(self.obj.args, args)
        else:
            self.assertIsNone(self.obj.args)

        if kwargs is not None:
            self.assertEqual(self.obj.kwargs, kwargs)
        else:
            self.assertIsNone(self.obj.kwargs)

    def test_init(self):
        # ScheduledCall must also be constructible without an explicit clock
        call = ScheduledCall(self.obj.func, *self.default_args,
                             **self.default_kwargs)
        call.schedule()
        call.cancel()

    def test_get_time_and_is_scheduled(self):
        sc, clock = self.sc, self.clock
        clock.advance(10)

        # nothing scheduled yet
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)

        # scheduled 5 seconds from now (clock is at 10)
        sc.schedule(5)
        self.assertTrue(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 15)

        # once the call has fired, the state is reset
        clock.advance(5)
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)

    def test_no_delay(self):
        # scheduling without a delay still waits for the clock to be advanced
        self.sc.schedule()
        self._check(None, None)
        self.clock.advance(0)
        self._check(self.default_args, self.default_kwargs)

    def test_default(self):
        sc, clock = self.sc, self.clock
        self.assertTrue(sc.schedule(5))
        self._check(None, None)

        clock.advance(1)
        # a second schedule while one is pending is refused and the original
        # deadline stays in effect
        self.assertFalse(sc.schedule(1))
        clock.advance(2)
        self._check(None, None)

        clock.advance(3)
        self._check(self.default_args, self.default_kwargs)

    def test_cancel(self):
        sc, clock = self.sc, self.clock
        sc.schedule(5)
        clock.advance(3)
        sc.cancel()

        # the original deadline passes without the function being called
        clock.advance(3)
        self._check(None, None)

        # after a cancel, scheduling is possible again
        self.assertTrue(sc.schedule(1))
        clock.advance(1)
        self._check(self.default_args, self.default_kwargs)

    def test_overwrite(self):
        new_args = ('crawlmi',)
        new_kwargs = {'a': 50, 'd': 'e'}
        self.sc.schedule(5, *new_args, **new_kwargs)
        self.clock.advance(5)
        self._check(new_args, new_kwargs)

    def test_partial_overwrite(self):
        # overriding only the positional arguments yields empty kwargs
        new_args = ('crawlmi',)
        self.sc.schedule(5, *new_args)
        self.clock.advance(5)
        self._check(new_args, {})

    def test_nested_schedule(self):
        def reschedule(*args, **kwargs):
            # the scheduled function schedules itself again when called
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = reschedule
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)

        # every advance of the clock triggers exactly one call
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule_delay(self):
        first_args, first_kwargs = ('a',), {'a': 'b'}
        second_args, second_kwargs = ('b',), {'b': 'c'}

        def reschedule(*args, **kwargs):
            # reschedule with different arguments from inside the call
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *second_args, **second_kwargs)

        self.sc.func = reschedule
        self.sc.schedule(3, *first_args, **first_kwargs)

        # first call fires at t=3 and leaves a new call pending
        self.clock.advance(3)
        self.assertIsNotNone(self.sc._call)
        self._check(first_args, first_kwargs)

        # t=6: the nested call (due at t=7) has not fired yet
        self.clock.advance(3)
        self._check(first_args, first_kwargs)

        # t=7: the nested call fires with the second set of arguments
        self.clock.advance(1)
        self._check(second_args, second_kwargs)
class Slot(object):
    '''Queue of requests belonging to one particular domain.

    Requests enter through `enqueue()` and are handed to the download handler
    while both DOWNLOAD_DELAY and CONCURRENT_REQUESTS_PER_DOMAIN are
    respected.
    '''

    def __init__(self, download_handler, concurrency, delay, randomize_delay,
                 clock=None):
        '''Store the configuration and create empty bookkeeping structures.

        `clock` is meant for unittests; when it is falsy, the reactor is used.
        '''
        # configuration
        self.download_handler = download_handler
        self.concurrency = concurrency
        self.delay = delay
        self.randomize_delay = randomize_delay

        # requests that were enqueued and whose deferred has not fired yet
        self.in_progress = set()
        # requests currently being downloaded (subset of `in_progress`)
        self.transferring = set()
        # value of `clock.seconds()` when the last download was started
        self.last_download_time = 0
        # queue of (request, deferred) pairs
        self.queue = MemoryQueue()

        self.clock = clock or reactor
        self.delayed_processing = ScheduledCall(self._process,
                                                clock=self.clock)

    def enqueue(self, request, dfd):
        '''Main entry point.

        Register the request as in progress, push it together with its
        deferred to the queue and try to start downloading right away.
        '''
        def forget_request(result):
            # runs when `dfd` fires, with a result or a failure
            self.in_progress.remove(request)
            return result

        # the request has to be registered before the callback is attached
        self.in_progress.add(request)
        dfd.addBoth(forget_request)
        self.queue.push((request, dfd))
        self._process()

    @property
    def free_slots(self):
        '''Number of downloads that may still be started concurrently.'''
        busy = len(self.transferring)
        return self.concurrency - busy

    def is_idle(self):
        '''Return True when no request is in progress.'''
        return not self.in_progress

    def _process(self):
        '''Start downloading queued requests, as far as the delay and the
        concurrency limit allow.
        '''
        # a delayed run is already pending, it will take care of the queue
        if self.delayed_processing.is_scheduled():
            return
        # the delay since the last download has not elapsed yet
        if self._schedule_delay():
            return

        while self.queue and self.free_slots > 0:
            self.last_download_time = self.clock.seconds()
            request, result_dfd = self.queue.pop()
            # forward the download outcome to the deferred given to `enqueue()`
            self._download(request).chainDeferred(result_dfd)
            if self._schedule_delay():
                return

    def _schedule_delay(self):
        '''Schedule a delayed `_process()` if the download delay is pending.

        Return True when the caller has to stop processing for now, False
        when the next download may start immediately.
        '''
        if not self.delay:
            return False

        next_allowed = self.last_download_time + self.get_download_delay()
        penalty = next_allowed - self.clock.seconds()
        if penalty > 0:
            # the return value of `schedule()` is ignored on purpose: the
            # slot reports the pending delay either way
            self.delayed_processing.schedule(penalty)
            return True
        return False

    def _download(self, request):
        '''Download the request through the download handler.

        Return the deferred of the download. The request is kept in
        `transferring` until that deferred gets its result.
        '''
        def rewrap_failure(failure):
            # It is VERY important to wrap the failure into a new object!
            # For errors like ConnectionLost, the same Failure object is
            # returned every time and we cannot use 'failure.request' field.
            return Failure(failure.value)

        def release_slot(result):
            # the download is over: unblock and process waiting requests
            self.transferring.remove(request)
            self._process()
            return result

        download_dfd = defer.succeed(request)
        download_dfd.addCallback(self.download_handler.download_request)
        download_dfd.addErrback(rewrap_failure)
        # block other requests for as long as this one is being downloaded
        self.transferring.add(request)
        download_dfd.addBoth(release_slot)
        return download_dfd

    def get_download_delay(self):
        '''Return the delay to apply between two downloads.

        With `randomize_delay` set, the value is drawn uniformly from
        0.5 * delay to 1.5 * delay, otherwise it is exactly `delay`.
        '''
        if not self.randomize_delay:
            return self.delay
        return random.uniform(0.5 * self.delay, 1.5 * self.delay)
class ScheduledCallTest(unittest.TestCase):
    '''Tests of ScheduledCall driven by a manually advanced Clock.

    `self.obj` is a ModifiedObject whose `func` is the scheduled target; the
    tests inspect its `args`, `kwargs` and `num_calls` attributes to find out
    whether, and with which arguments, the call was made.
    '''

    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        self.obj = ModifiedObject()
        self.sc = ScheduledCall(self.obj.func, clock=self.clock,
                                *self.default_args, **self.default_kwargs)

    def _check(self, args, kwargs):
        # Compare the values recorded on `self.obj` with the expected ones.
        # Passing None asserts that the recorded value is still None.
        if args is not None:
            self.assertTupleEqual(self.obj.args, args)
        else:
            self.assertIsNone(self.obj.args)

        if kwargs is not None:
            self.assertEqual(self.obj.kwargs, kwargs)
        else:
            self.assertIsNone(self.obj.kwargs)

    def test_init(self):
        # ScheduledCall must also be constructible without an explicit clock
        call = ScheduledCall(self.obj.func, *self.default_args,
                             **self.default_kwargs)
        call.schedule()
        call.cancel()

    def test_get_time_and_is_scheduled(self):
        sc, clock = self.sc, self.clock
        clock.advance(10)

        # nothing scheduled yet
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)

        # scheduled 5 seconds from now (clock is at 10)
        sc.schedule(5)
        self.assertTrue(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 15)

        # once the call has fired, the state is reset
        clock.advance(5)
        self.assertFalse(sc.is_scheduled())
        self.assertEqual(sc.get_time(), 0)

    def test_no_delay(self):
        # scheduling without a delay still waits for the clock to be advanced
        self.sc.schedule()
        self._check(None, None)
        self.clock.advance(0)
        self._check(self.default_args, self.default_kwargs)

    def test_default(self):
        sc, clock = self.sc, self.clock
        self.assertTrue(sc.schedule(5))
        self._check(None, None)

        clock.advance(1)
        # a second schedule while one is pending is refused and the original
        # deadline stays in effect
        self.assertFalse(sc.schedule(1))
        clock.advance(2)
        self._check(None, None)

        clock.advance(3)
        self._check(self.default_args, self.default_kwargs)

    def test_cancel(self):
        sc, clock = self.sc, self.clock
        sc.schedule(5)
        clock.advance(3)
        sc.cancel()

        # the original deadline passes without the function being called
        clock.advance(3)
        self._check(None, None)

        # after a cancel, scheduling is possible again
        self.assertTrue(sc.schedule(1))
        clock.advance(1)
        self._check(self.default_args, self.default_kwargs)

    def test_overwrite(self):
        new_args = ('crawlmi',)
        new_kwargs = {'a': 50, 'd': 'e'}
        self.sc.schedule(5, *new_args, **new_kwargs)
        self.clock.advance(5)
        self._check(new_args, new_kwargs)

    def test_partial_overwrite(self):
        # overriding only the positional arguments yields empty kwargs
        new_args = ('crawlmi',)
        self.sc.schedule(5, *new_args)
        self.clock.advance(5)
        self._check(new_args, {})

    def test_nested_schedule(self):
        def reschedule(*args, **kwargs):
            # the scheduled function schedules itself again when called
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = reschedule
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)

        # every advance of the clock triggers exactly one call
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule_delay(self):
        first_args, first_kwargs = ('a',), {'a': 'b'}
        second_args, second_kwargs = ('b',), {'b': 'c'}

        def reschedule(*args, **kwargs):
            # reschedule with different arguments from inside the call
            self.obj.func(*args, **kwargs)
            self.sc.schedule(4, *second_args, **second_kwargs)

        self.sc.func = reschedule
        self.sc.schedule(3, *first_args, **first_kwargs)

        # first call fires at t=3 and leaves a new call pending
        self.clock.advance(3)
        self.assertIsNotNone(self.sc._call)
        self._check(first_args, first_kwargs)

        # t=6: the nested call (due at t=7) has not fired yet
        self.clock.advance(3)
        self._check(first_args, first_kwargs)

        # t=7: the nested call fires with the second set of arguments
        self.clock.advance(1)
        self._check(second_args, second_kwargs)