示例#1
0
class LogStats(object):
    '''Periodically log the number of downloaded pages and the crawl rate.

    The reporting period comes from the LOG_STATS_INTERVAL setting; when
    that setting is falsy, `NotConfigured` is raised from the constructor.
    '''

    def __init__(self, engine, clock=None):
        self.interval = engine.settings.get_float('LOG_STATS_INTERVAL')
        if not self.interval:
            raise NotConfigured
        # scales a per-interval page count to the pages/min figure
        # reported by `log`
        self.multiplier = 60.0 / self.interval
        self.logging = LoopingCall(self.log, clock=clock)

        connect = engine.signals.connect
        connect(self.engine_started, signal=signals.engine_started)
        connect(self.engine_stopped, signal=signals.engine_stopped)
        connect(self.response_downloaded, signal=signals.response_downloaded)

        # running total, and the total seen at the previous report
        self.downloaded = 0
        self.downloaded_prev = 0

    def engine_started(self):
        '''Begin the periodic reporting once the engine is running.'''
        self.logging.schedule(self.interval)

    def engine_stopped(self):
        '''Stop the periodic reporting.'''
        self.logging.cancel()

    def response_downloaded(self):
        '''Count one more downloaded response.'''
        self.downloaded += 1

    def log(self):
        '''Emit one report line and remember the current total.'''
        total = self.downloaded
        pages_per_min = (total - self.downloaded_prev) * self.multiplier
        self.downloaded_prev = total
        log.msg(format='Crawled %(down)d pages (at %(speed)d pages/min).',
                level=log.INFO, down=total, speed=pages_per_min)
示例#2
0
class LogStats(object):
    '''Report crawl progress (page count and pages/min) at a fixed interval.

    The interval is read from the LOG_STATS_INTERVAL setting. A falsy value
    makes the constructor raise `NotConfigured`.
    '''

    def __init__(self, engine, clock=None):
        self.interval = engine.settings.get_float('LOG_STATS_INTERVAL')
        if not self.interval:
            raise NotConfigured
        # factor turning "pages per interval" into "pages per minute"
        self.multiplier = 60.0 / self.interval
        self.logging = LoopingCall(self.log, clock=clock)

        # (handler, signal) pairs, connected in this order
        subscriptions = (
            (self.engine_started, signals.engine_started),
            (self.engine_stopped, signals.engine_stopped),
            (self.response_downloaded, signals.response_downloaded),
        )
        for handler, sig in subscriptions:
            engine.signals.connect(handler, signal=sig)

        self.downloaded = 0
        self.downloaded_prev = 0

    def engine_started(self):
        '''Start the looping report, one run every `self.interval`.'''
        self.logging.schedule(self.interval)

    def engine_stopped(self):
        '''Cancel the looping report.'''
        self.logging.cancel()

    def response_downloaded(self):
        '''Increment the downloaded-pages counter.'''
        self.downloaded += 1

    def log(self):
        '''Log the total page count and the rate since the last report.'''
        pages_since_last = self.downloaded - self.downloaded_prev
        downloaded_speed = pages_since_last * self.multiplier
        self.downloaded_prev = self.downloaded
        log.msg(
            format='Crawled %(down)d pages (at %(speed)d pages/min).',
            level=log.INFO,
            down=self.downloaded,
            speed=downloaded_speed,
        )
示例#3
0
 def test_init(self):
     # Build a LoopingCall without a `clock=` argument, so it falls back
     # to its own default clock, then check that it can be scheduled and
     # cancelled.
     looping_call = LoopingCall(
         self.obj.func, *self.default_args, **self.default_kwargs)
     looping_call.schedule()
     looping_call.cancel()
示例#4
0
class LoopingCallTest(unittest.TestCase):
    # Tests for LoopingCall, driven by explicit `clock.advance()` /
    # `clock.pump()` calls on a Clock instance created in setUp.

    # positional / keyword arguments forwarded to the scheduled function
    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        # NOTE(review): ModifiedObject is defined elsewhere; the tests below
        # read its `num_calls`, `args` and `kwargs` attributes, so it
        # presumably records every `func()` invocation - confirm.
        self.obj = ModifiedObject()
        self.sc = LoopingCall(self.obj.func, clock=self.clock,
                              *self.default_args,
                              **self.default_kwargs)

    def _check(self, args, kwargs):
        # Assert the arguments recorded on self.obj. Passing None for
        # `args` / `kwargs` asserts that the recorded value is None too.
        if args is None:
            self.assertIsNone(self.obj.args)
        else:
            self.assertTupleEqual(self.obj.args, args)

        if kwargs is None:
            self.assertIsNone(self.obj.kwargs)
        else:
            self.assertEqual(self.obj.kwargs, kwargs)

    def test_init(self):
        # test initializing LoopingCall without overriding its clock
        sc = LoopingCall(self.obj.func, *self.default_args,
                           **self.default_kwargs)
        sc.schedule()
        sc.cancel()

    def test_basic(self):
        # Two calls, 2 time units apart, first one delayed (now=False).
        # scheduling
        self.assertFalse(self.sc.is_scheduled())
        self.sc.schedule(2, count=2, now=False)
        # before the first call
        self.assertTrue(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 2)
        self.assertEqual(self.sc.calls_left(), 2)
        self.clock.advance(1)
        self.assertEqual(self.obj.num_calls, 0)
        # after the first call
        self.clock.advance(1)
        self.assertEqual(self.obj.num_calls, 1)
        self.assertEqual(self.sc.calls_left(), 1)
        self._check(self.default_args, self.default_kwargs)
        # after the second call
        self.clock.advance(2)
        self.assertEqual(self.obj.num_calls, 2)
        self.assertEqual(self.sc.calls_left(), 0)
        self.assertFalse(self.sc.is_scheduled())
        # no more calls
        self.clock.advance(20)
        self.assertEqual(self.obj.num_calls, 2)

    def test_now(self):
        # With now=True the first call is expected on the next clock tick
        # (advance(0)), not synchronously inside schedule().
        self.sc.schedule(2, count=2, now=True)
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(20)
        self.assertEqual(self.obj.num_calls, 2)

    def test_infinite(self):
        # Without `count` the call keeps repeating: 100 ticks -> 100 calls,
        # still scheduled, and calls_left() is None.
        self.sc.schedule(2)
        self.clock.pump([2] * 100)
        self.assertEqual(self.obj.num_calls, 100)
        self.assertTrue(self.sc.is_scheduled())
        self.assertIsNone(self.sc.calls_left())

    def test_cancel(self):
        # Cancelling before the first deadline means the function never runs.
        self.sc.schedule(2)
        self.clock.advance(1)
        self.sc.cancel()
        self.clock.advance(20)
        self.assertEqual(self.obj.num_calls, 0)

    def test_reschedule(self):
        # Scheduling again replaces the pending call: nothing fires at the
        # old deadline, and the first call comes 5 units after rescheduling.
        self.sc.schedule(2)
        self.clock.advance(1)
        self.sc.schedule(5)
        self.clock.advance(4)
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(1)
        self.assertEqual(self.obj.num_calls, 1)

    def test_no_delay(self):
        # schedule() with no delay: exactly one call per advance(0).
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule(self):
        # The scheduled function re-schedules the LoopingCall from inside
        # its own invocation; there must still be one call per advance(0).
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()
        self.sc.func = func
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)
 def test_init(self):
     # No `clock=` is passed here: the LoopingCall has to work with its
     # default clock. Scheduling and immediately cancelling must succeed.
     call = LoopingCall(self.obj.func,
                        *self.default_args,
                        **self.default_kwargs)
     call.schedule()
     call.cancel()
class LoopingCallTest(unittest.TestCase):
    # Tests for LoopingCall. Time is moved forward only through explicit
    # `clock.advance()` / `clock.pump()` calls on the Clock built in setUp.

    # arguments that LoopingCall is expected to forward to the function
    default_args = (10, 'hello')
    default_kwargs = {'a': 47, 'b': 'c'}

    def setUp(self):
        self.clock = Clock()
        # NOTE(review): ModifiedObject comes from elsewhere; the tests read
        # `num_calls`, `args` and `kwargs` from it, so it presumably records
        # each `func()` call - confirm.
        self.obj = ModifiedObject()
        self.sc = LoopingCall(self.obj.func,
                              clock=self.clock,
                              *self.default_args,
                              **self.default_kwargs)

    def _check(self, args, kwargs):
        # Compare the arguments recorded on self.obj with the expected
        # ones; None means "expect None to have been recorded".
        if args is None:
            self.assertIsNone(self.obj.args)
        else:
            self.assertTupleEqual(self.obj.args, args)

        if kwargs is None:
            self.assertIsNone(self.obj.kwargs)
        else:
            self.assertEqual(self.obj.kwargs, kwargs)

    def test_init(self):
        # test initializing LoopingCall without overriding its clock
        sc = LoopingCall(self.obj.func, *self.default_args,
                         **self.default_kwargs)
        sc.schedule()
        sc.cancel()

    def test_basic(self):
        # count=2, delay=2, now=False: calls expected at t=2 and t=4 only.
        # scheduling
        self.assertFalse(self.sc.is_scheduled())
        self.sc.schedule(2, count=2, now=False)
        # before the first call
        self.assertTrue(self.sc.is_scheduled())
        self.assertEqual(self.sc.get_time(), 2)
        self.assertEqual(self.sc.calls_left(), 2)
        self.clock.advance(1)
        self.assertEqual(self.obj.num_calls, 0)
        # after the first call
        self.clock.advance(1)
        self.assertEqual(self.obj.num_calls, 1)
        self.assertEqual(self.sc.calls_left(), 1)
        self._check(self.default_args, self.default_kwargs)
        # after the second call
        self.clock.advance(2)
        self.assertEqual(self.obj.num_calls, 2)
        self.assertEqual(self.sc.calls_left(), 0)
        self.assertFalse(self.sc.is_scheduled())
        # no more calls
        self.clock.advance(20)
        self.assertEqual(self.obj.num_calls, 2)

    def test_now(self):
        # now=True: the first call is expected on the next tick
        # (advance(0)) rather than inside schedule() itself.
        self.sc.schedule(2, count=2, now=True)
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(20)
        self.assertEqual(self.obj.num_calls, 2)

    def test_infinite(self):
        # No `count`: the call repeats indefinitely and calls_left() is None.
        self.sc.schedule(2)
        self.clock.pump([2] * 100)
        self.assertEqual(self.obj.num_calls, 100)
        self.assertTrue(self.sc.is_scheduled())
        self.assertIsNone(self.sc.calls_left())

    def test_cancel(self):
        # A call cancelled before its deadline must never fire.
        self.sc.schedule(2)
        self.clock.advance(1)
        self.sc.cancel()
        self.clock.advance(20)
        self.assertEqual(self.obj.num_calls, 0)

    def test_reschedule(self):
        # A second schedule() supersedes the first: the old deadline is
        # skipped and the call fires 5 units after the reschedule.
        self.sc.schedule(2)
        self.clock.advance(1)
        self.sc.schedule(5)
        self.clock.advance(4)
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(1)
        self.assertEqual(self.obj.num_calls, 1)

    def test_no_delay(self):
        # With no delay given, each advance(0) triggers exactly one call.
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)

    def test_nested_schedule(self):
        # Calling schedule() from inside the scheduled function must not
        # produce extra calls: still one per advance(0).
        def func(*args, **kwargs):
            self.obj.func(*args, **kwargs)
            self.sc.schedule()

        self.sc.func = func
        self.sc.schedule()
        self.assertEqual(self.obj.num_calls, 0)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 1)
        self.clock.advance(0)
        self.assertEqual(self.obj.num_calls, 2)
示例#7
0
class Downloader(object):
    '''Fetch requests from `request_queue` queue. When downloaded,
    put the results into `response_queue` queue. Respect CONCURRENT_REQUESTS
    setting.
    Requests are further divided into specific slots, based on their domains.

    When DOWNLOAD_DELAY is set, a single shared slot with concurrency 1 is
    used. Otherwise CONCURRENT_REQUESTS limits the total number of requests
    in progress, and CONCURRENT_REQUESTS_PER_DOMAIN (when set and lower than
    the total) limits each hostname separately.
    '''

    # how many seconds to wait between the checks of request_queue
    QUEUE_CHECK_FREQUENCY = 0.1

    def __init__(self, settings, request_queue, response_queue,
                 download_handler=None, clock=None):
        '''Configure the downloader and start polling `request_queue`.

        `download_handler` defaults to `GeneralHandler(settings)` and
        `clock` defaults to the reactor.
        '''
        self.request_queue = request_queue
        self.response_queue = response_queue  # queue of responses
        self.download_handler = download_handler or GeneralHandler(settings)
        self.slots = {}
        self.num_in_progress = 0
        self.clock = clock or reactor

        # Read as a float: an integer cannot represent a sub-second delay
        # such as 0.25.
        self.download_delay = settings.get_float('DOWNLOAD_DELAY')
        self.randomize_delay = settings.get_int('RANDOMIZE_DOWNLOAD_DELAY')
        if self.download_delay:
            # a delay implies strictly sequential downloads
            self.total_concurrency = self.domain_concurrency = 1
            self.use_domain_specific = False
        else:
            self.total_concurrency = settings.get_int('CONCURRENT_REQUESTS')
            self.domain_concurrency = settings.get_int(
                'CONCURRENT_REQUESTS_PER_DOMAIN')
            if (not self.domain_concurrency or
                    self.domain_concurrency >= self.total_concurrency):
                # a per-domain limit that is unset, or not stricter than
                # the global one, is pointless: use one shared slot
                self.use_domain_specific = False
                self.domain_concurrency = self.total_concurrency
            else:
                self.use_domain_specific = True

        # Start polling only once every attribute read by `process` exists,
        # so a failure while reading the settings above cannot leave a
        # scheduled call pointing at a half-initialized instance.
        self.running = True
        self.processing = LoopingCall(self.process, clock=self.clock)
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY, now=True)

    def close(self):
        '''Stop polling and stop pushing results to `response_queue`.'''
        self.processing.cancel()
        self.running = False

    @property
    def free_slots(self):
        '''Number of additional requests that may be started right now.'''
        return self.total_concurrency - self.num_in_progress

    def is_idle(self):
        '''Return True when no request is in progress.'''
        return self.num_in_progress == 0

    def process(self):
        '''Move requests from `request_queue` into download slots.

        Keeps going while the downloader is running, `response_queue` does
        not need to back out, requests are available and free slots remain.
        '''
        # Both callbacks depend only on `self`, so they are built once per
        # call instead of once per request.
        def remove_in_progress(response):
            self.num_in_progress -= 1
            self._clear_slots()  # clear empty slots
            return response

        def enqueue_result(request, result):
            # in a case, result is actually a Failure
            result.request = request
            # make sure not to modify response_queue, after stopping the downloader
            if self.running:
                self.response_queue.push(result)
            # don't return anything from here, in a case an error occurred -
            # we don't want it to be logged

        while (self.running and not self.response_queue.needs_backout() and
                self.request_queue and self.free_slots > 0):
            request = self.request_queue.pop()
            _, slot = self._get_slot(request)

            self.num_in_progress += 1
            dfd = defer.Deferred().addBoth(remove_in_progress)
            dfd.addBoth(partial(enqueue_result, request))
            slot.enqueue(request, dfd)

    def _get_slot(self, request):
        '''Return `(key, slot)` for `request`, creating the slot if needed.

        The key is the request's hostname when per-domain limits are in
        use, and the empty string (one shared slot) otherwise.
        '''
        key = request.parsed_url.hostname if self.use_domain_specific else ''
        if key not in self.slots:
            self.slots[key] = Slot(
                self.download_handler,
                self.domain_concurrency,
                self.download_delay,
                self.randomize_delay,
                clock=self.clock)
        return key, self.slots[key]

    def _clear_slots(self):
        '''Clear unused slots and avoid memory leaking.'''
        # only bother once the table has grown to twice the concurrency limit
        if len(self.slots) >= 2 * self.total_concurrency:
            # `items()` works on both Python 2 and 3; the keys are collected
            # first so the dict is not mutated while being iterated
            idle_keys = [key for key, slot in self.slots.items()
                         if slot.is_idle()]
            for key in idle_keys:
                del self.slots[key]
示例#8
0
class Downloader(object):
    '''Fetch requests from `request_queue` queue. When downloaded,
    put the results into `response_queue` queue. Respect CONCURRENT_REQUESTS
    setting.
    Requests are further divided into specific slots, based on their domains.

    With a non-zero DOWNLOAD_DELAY everything goes through one shared slot
    with concurrency 1. Without it, CONCURRENT_REQUESTS caps the requests in
    progress and CONCURRENT_REQUESTS_PER_DOMAIN (when set and lower than the
    total) caps each hostname.
    '''

    # how many seconds to wait between the checks of request_queue
    QUEUE_CHECK_FREQUENCY = 0.1

    def __init__(self, settings, request_queue, response_queue,
                 download_handler=None, clock=None):
        '''Read the concurrency settings and start polling `request_queue`.

        `download_handler` falls back to `GeneralHandler(settings)` and
        `clock` falls back to the reactor.
        '''
        self.request_queue = request_queue
        self.response_queue = response_queue  # queue of responses
        self.download_handler = download_handler or GeneralHandler(settings)
        self.slots = {}
        self.num_in_progress = 0
        self.clock = clock or reactor

        self.download_delay = settings.get_float('DOWNLOAD_DELAY')
        self.randomize_delay = settings.get_int('RANDOMIZE_DOWNLOAD_DELAY')
        if self.download_delay:
            # a delay forces one request at a time
            self.total_concurrency = self.domain_concurrency = 1
            self.use_domain_specific = False
        else:
            self.total_concurrency = settings.get_int('CONCURRENT_REQUESTS')
            self.domain_concurrency = settings.get_int(
                'CONCURRENT_REQUESTS_PER_DOMAIN')
            if (not self.domain_concurrency or
                    self.domain_concurrency >= self.total_concurrency):
                # the per-domain limit is unset or not stricter than the
                # global limit, so a single shared slot is enough
                self.use_domain_specific = False
                self.domain_concurrency = self.total_concurrency
            else:
                self.use_domain_specific = True

        # Schedule the polling loop last: `process` reads `running` and the
        # concurrency attributes, and an exception raised while reading the
        # settings must not leave a call scheduled on a partial instance.
        self.running = True
        self.processing = LoopingCall(self.process, clock=self.clock)
        self.processing.schedule(self.QUEUE_CHECK_FREQUENCY, now=True)

    def close(self):
        '''Cancel the polling loop and mark the downloader as stopped.'''
        self.processing.cancel()
        self.running = False

    @property
    def free_slots(self):
        '''How many more requests may be put in progress.'''
        return self.total_concurrency - self.num_in_progress

    def is_idle(self):
        '''Return True when there are no requests in progress.'''
        return self.num_in_progress == 0

    def process(self):
        '''Pop requests from `request_queue` and enqueue them on slots.

        Runs while the downloader is running, `response_queue` does not ask
        for a back-out, there are pending requests and a slot is free.
        '''
        # These callbacks close over `self` only, so they are defined once
        # here rather than on every loop iteration.
        def remove_in_progress(response):
            self.num_in_progress -= 1
            self._clear_slots()  # clear empty slots
            return response

        def enqueue_result(request, result):
            # in a case, result is actually a Failure
            result.request = request
            # make sure not to modify response_queue, after stopping the downloader
            if self.running:
                self.response_queue.push(result)
            # don't return anything from here, in a case an error occurred -
            # we don't want it to be logged

        while (self.running and not self.response_queue.needs_backout() and
                self.request_queue and self.free_slots > 0):
            request = self.request_queue.pop()
            _, slot = self._get_slot(request)

            self.num_in_progress += 1
            dfd = defer.Deferred().addBoth(remove_in_progress)
            dfd.addBoth(partial(enqueue_result, request))
            slot.enqueue(request, dfd)

    def _get_slot(self, request):
        '''Return the `(key, slot)` pair serving `request`.

        Slots are keyed by hostname when per-domain limits apply, and by
        the empty string otherwise; a missing slot is created on demand.
        '''
        key = request.parsed_url.hostname if self.use_domain_specific else ''
        if key not in self.slots:
            self.slots[key] = Slot(
                self.download_handler,
                self.domain_concurrency,
                self.download_delay,
                self.randomize_delay,
                clock=self.clock)
        return key, self.slots[key]

    def _clear_slots(self):
        '''Clear unused slots and avoid memory leaking.'''
        # cleanup starts once the table reaches twice the concurrency limit
        if len(self.slots) >= 2 * self.total_concurrency:
            # `items()` exists on Python 2 and 3 alike; gather the keys
            # before deleting so the dict is never mutated mid-iteration
            idle_keys = [key for key, slot in self.slots.items()
                         if slot.is_idle()]
            for key in idle_keys:
                del self.slots[key]