class GeneralTest(unittest.TestCase):
    """Exercises GeneralHandler: handler loading and per-request lookup."""

    def setUp(self):
        """Build a GeneralHandler from settings that map three URL schemes."""
        download_handlers = {
            'file': 'crawlmi.core.handlers.FileDownloadHandler',
            'http': 'crawlmi.core.handlers.HttpDownloadHandler',
            # test_init expects this scheme to end up in `_not_configured`
            'https': 'crawlmi.tests.test_downloader_handlers.NonConfiguredHandler',
        }
        self.settings = Settings({'DOWNLOAD_HANDLERS': download_handlers})
        self.handler = GeneralHandler(self.settings)

    def test_init(self):
        # The two usable schemes must have been instantiated ...
        loaded = self.handler._handlers
        expected_types = (
            ('file', FileDownloadHandler),
            ('http', HttpDownloadHandler),
        )
        for scheme, handler_cls in expected_types:
            self.assertIsInstance(loaded[scheme], handler_cls)
        # ... while 'https' must be recorded as not configured.
        self.assertIn('https', self.handler._not_configured)

    def test_get_handler(self):
        # Requests are routed to a handler instance of the matching class.
        routed = (
            ('file:///etc/fstab', FileDownloadHandler),
            ('http://www.github.com/', HttpDownloadHandler),
        )
        for url, handler_cls in routed:
            found = self.handler._get_handler(Request(url))
            self.assertIsInstance(found, handler_cls)

        # Looking up a handler for the 'https' request must raise
        # NotSupported. The request is built outside the `with` block so that
        # only the lookup itself is checked for the exception.
        https_request = Request('https://www.githib.com/')
        with self.assertRaises(NotSupported):
            self.handler._get_handler(https_request)
def setUp(self):
    """Create the settings object and the GeneralHandler under test.

    The settings carry a single key, 'DOWNLOAD_HANDLERS', mapping the
    'file', 'http' and 'https' schemes to dotted handler paths.
    """
    scheme_handlers = {
        'file': 'crawlmi.core.handlers.FileDownloadHandler',
        'http': 'crawlmi.core.handlers.HttpDownloadHandler',
        # NOTE(review): presumably a handler that reports itself as not
        # configured -- confirm against its definition in the test module.
        'https': 'crawlmi.tests.test_downloader_handlers.NonConfiguredHandler',
    }
    settings = Settings({'DOWNLOAD_HANDLERS': scheme_handlers})
    self.settings = settings
    self.handler = GeneralHandler(settings)
def __init__(self, settings, request_queue, response_queue,
             download_handler=None, clock=None):
    """Store the queues, start the processing loop and read the limits.

    Args:
        settings: settings object, read through `get_float` / `get_int`
            and passed to `GeneralHandler` when no handler is supplied.
        request_queue: stored as `self.request_queue`.
        response_queue: stored as `self.response_queue`.
        download_handler: handler to use; any falsy value means a new
            `GeneralHandler(settings)` is created instead.
        clock: clock given to the `LoopingCall`; any falsy value means
            `reactor` is used.
    """
    self.request_queue = request_queue
    self.response_queue = response_queue  # queue of responses

    if not download_handler:
        download_handler = GeneralHandler(settings)
    self.download_handler = download_handler

    self.slots = {}
    self.num_in_progress = 0

    self.clock = clock if clock else reactor
    # NOTE(review): the loop is scheduled before `running` and the
    # delay/concurrency attributes below are assigned. Presumably
    # `schedule(..., now=True)` does not invoke `self.process`
    # synchronously -- confirm before reordering anything here.
    self.processing = loop = LoopingCall(self.process, clock=self.clock)
    loop.schedule(self.QUEUE_CHECK_FREQUENCY, now=True)
    self.running = True

    self.download_delay = settings.get_float('DOWNLOAD_DELAY')
    self.randomize_delay = settings.get_int('RANDOMIZE_DOWNLOAD_DELAY')

    if self.download_delay:
        # A non-zero delay pins both concurrency limits to 1.
        self.total_concurrency = 1
        self.domain_concurrency = 1
        self.use_domain_specific = False
        return

    total = settings.get_int('CONCURRENT_REQUESTS')
    per_domain = settings.get_int('CONCURRENT_REQUESTS_PER_DOMAIN')

    # A per-domain limit is only used when it is set (non-zero) and
    # strictly lower than the total limit; otherwise the total limit is
    # used for domains as well.
    domain_limited = bool(per_domain) and per_domain < total

    self.total_concurrency = total
    self.domain_concurrency = per_domain if domain_limited else total
    self.use_domain_specific = domain_limited