def __init__(self, path='.'):
    '''Detect whether `path` lies inside a crawlmi project.

    Walks up from `path` looking for the closest project config file
    (via `get_closest_cfg`). If one is found: the project directory is
    appended to `sys.path` (so the settings module can be imported),
    the config file is parsed, and the user settings module named in
    its `[crawlmi] settings` option is imported and wrapped in a
    `Settings` object. Import failures are reported as warnings, not
    errors, so a broken settings module still leaves
    `inside_project == True`.
    '''
    self.inside_project = False
    self.cfg_path = self.get_closest_cfg(path=path)
    self.cfg = None
    self.project_dir = None
    self.module_settings = Settings()  # user-defined settings file
    self._data_dir = None
    if self.cfg_path:
        # init project dir
        self.project_dir = os.path.dirname(self.cfg_path)
        if self.project_dir not in sys.path:
            sys.path.append(self.project_dir)
        # init cfg
        self.cfg = SafeConfigParser()
        self.cfg.read([self.cfg_path])
        # init settings
        if self.cfg.has_option('crawlmi', 'settings'):
            settings_module_path = self.cfg.get('crawlmi', 'settings')
            try:
                # fromlist [''] forces import of the submodule itself,
                # not just the top-level package
                settings_module = __import__(settings_module_path, {}, {}, [''])
            except ImportError as exc:
                # best-effort: warn instead of failing hard
                warnings.warn(
                    'Cannot import crawlmi settings module %s: %s' %
                    (settings_module_path, exc))
            else:
                self.module_settings = Settings.from_module(
                    settings_module)
        self.inside_project = True
def setUp(self):
    '''Wire up a downloader with in-memory queues, a mock download
    handler and a deterministic fake clock.
    '''
    self.clock = Clock()
    self.request_queue = MemoryQueue()
    self.response_queue = ResponseQueue()
    mock_handler = MockDownloaderHandler(Settings())
    self.dwn = Downloader(
        Settings(self.default_settings),
        self.request_queue,
        self.response_queue,
        download_handler=mock_handler,
        clock=self.clock)
    self.handler = self.dwn.download_handler
def _update_dwn(self, **kwargs):
    '''Update downloader with the new settings.
    '''
    merged = self.default_settings.copy()
    merged.update(**kwargs)
    # stop the old downloader's scheduled processing before replacing it
    self.dwn.processing.cancel()
    self.dwn = Downloader(
        Settings(merged),
        self.request_queue,
        self.response_queue,
        download_handler=MockDownloaderHandler(Settings()),
        clock=self.clock)
    self.handler = self.dwn.download_handler
def __init__(self, path='.'):
    '''Detect whether `path` lies inside a crawlmi project.

    Finds the closest project config file starting from `path`; when
    one exists, registers the project directory on `sys.path`, parses
    the config and imports the settings module listed under the
    `[crawlmi] settings` option. An unimportable settings module only
    emits a warning — the project is still considered detected.
    '''
    self.inside_project = False
    self.cfg_path = self.get_closest_cfg(path=path)
    self.cfg = None
    self.project_dir = None
    self.module_settings = Settings()  # user-defined settings file
    self._data_dir = None
    if self.cfg_path:
        # init project dir
        self.project_dir = os.path.dirname(self.cfg_path)
        if self.project_dir not in sys.path:
            sys.path.append(self.project_dir)
        # init cfg
        self.cfg = SafeConfigParser()
        self.cfg.read([self.cfg_path])
        # init settings
        if self.cfg.has_option('crawlmi', 'settings'):
            settings_module_path = self.cfg.get('crawlmi', 'settings')
            try:
                # fromlist [''] makes __import__ return the leaf module
                settings_module = __import__(settings_module_path, {}, {}, [''])
            except ImportError as exc:
                warnings.warn(
                    'Cannot import crawlmi settings module %s: %s' %
                    (settings_module_path, exc))
            else:
                self.module_settings = Settings.from_module(settings_module)
        self.inside_project = True
def setUp(self):
    '''Create a temporary file with known content and expose the file
    download handler's `download_request` for the tests.
    '''
    self.tmpname = self.mktemp()
    # use a context manager so the descriptor is closed even if the
    # write raises (the original leaked the handle on failure)
    with open(self.tmpname + '^', 'wb') as fd:
        fd.write('0123456789')
    self.download_request = FileDownloadHandler(
        Settings()).download_request
def get_settings(self, args, options):
    '''Return command specific settings.

    Default behavior is to combine `command_settings` with the
    settings received from command line. `self.engine` is still not
    initialized, so don't use it.
    '''
    combined = self.command_settings.copy()
    if hasattr(options, 'set'):
        try:
            combined.update(arglist_to_dict(options.set))
        except ValueError:
            raise UsageError('Invalid -s value, use -s NAME=VALUE',
                             print_help=False)
    # logging behavior
    logfile = getattr(options, 'logfile', None)
    if logfile:
        combined['LOG_ENABLED'] = True
        combined['LOG_FILE'] = logfile
    loglevel = getattr(options, 'loglevel', None)
    if loglevel:
        combined['LOG_ENABLED'] = True
        combined['LOG_LEVEL'] = loglevel
    if getattr(options, 'nolog', None):
        combined['LOG_ENABLED'] = False
    return Settings(combined)
def test_get_commands(self):
    '''Command discovery honors COMMAND_MODULES from settings.'''
    settings = Settings(
        {'COMMAND_MODULES': ['crawlmi.tests.test_cmdline.sample_commands']})
    cmds = get_commands(settings, False)
    self.assertNotIn('command1', cmds)
    self.assertIn('command2', cmds)
def setUp(self):
    '''Build a download slot around a mock handler and a fake clock.'''
    self.handler = MockDownloaderHandler(Settings())
    self.clock = Clock()
    self.slot = Slot(
        self.handler,
        self.default_concurrency,
        self.default_delay,
        self.default_randomize_delay,
        clock=self.clock)
def setUp(self):
    '''Start a local HTTP site on an ephemeral port and create the
    HTTP download handler pointed at it.
    '''
    site = server.Site(UriResource(), timeout=None)
    self.port = reactor.listenTCP(
        0, WrappingFactory(site), interface='127.0.0.1')
    self.portno = self.port.getHost().port
    self.download_handler = HttpDownloadHandler(
        Settings({'CONCURRENT_REQUESTS_PER_DOMAIN': 8}))
    self.download_request = self.download_handler.download_request
def setUp(self):
    '''Copy the sample spiders into a fresh temp dir, make it
    importable and load it through a SpiderManager.
    '''
    source_dir = os.path.join(module_dir, 'test_spiders')
    self.tmpdir = self.mktemp()
    os.mkdir(self.tmpdir)
    self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
    shutil.copytree(source_dir, self.spiders_dir)
    sys.path.append(self.tmpdir)
    self.manager = SpiderManager(
        Settings({'SPIDER_MODULES': ['test_spiders_xxx']}))
    self.lw = LogWrapper()
    self.lw.setUp()
def setUp(self):
    '''Create a GeneralHandler configured with file/http/https schemes.'''
    handlers = {
        'file': 'crawlmi.core.handlers.FileDownloadHandler',
        'http': 'crawlmi.core.handlers.HttpDownloadHandler',
        'https': 'crawlmi.tests.test_downloader_handlers.NonConfiguredHandler',
    }
    self.settings = Settings({'DOWNLOAD_HANDLERS': handlers})
    self.handler = GeneralHandler(self.settings)
def test_failure(self):
    '''Each failed download produces its own distinct Failure wrapping
    a ValueError.
    '''
    self.slot.download_handler = FailureDownloaderHandler(Settings())
    results = []
    for i in xrange(2):
        request, dfd = get_request(str(i))
        dfd.addBoth(results.append)
        self.slot.enqueue(request, dfd)
    self.assertEqual(len(results), 2)
    first, second = results
    self.assertIsInstance(first, Failure)
    self.assertIsInstance(second, Failure)
    self.assertIsNot(first, second)
    self.assertIsInstance(first.value, ValueError)
def setUp(self):
    '''Serve a small static site whose child resources exercise the
    download edge cases (redirects, hangs, missing length, ...).
    '''
    name = self.mktemp()
    os.mkdir(name)
    FilePath(name).child('file').setContent('0123456789')
    root = static.File(name)
    resources = [
        ('redirect', util.Redirect('/file')),
        ('wait', ForeverTakingResource()),
        ('hang-after-headers', ForeverTakingResource(write=True)),
        ('nolength', NoLengthResource()),
        ('host', HostHeaderResource()),
        ('payload', PayloadResource()),
        ('broken', BrokenDownloadResource()),
    ]
    for path, resource in resources:
        root.putChild(path, resource)
    self.site = server.Site(root, timeout=None)
    self.wrapper = WrappingFactory(self.site)
    self.port = reactor.listenTCP(0, self.wrapper, interface='127.0.0.1')
    self.portno = self.port.getHost().port
    self.download_handler = self.download_handler_cls(
        Settings({'CONCURRENT_REQUESTS_PER_DOMAIN': 8}))
    self.download_request = self.download_handler.download_request
def test_from_module(self):
    '''Settings.from_module picks up module attributes but not dunders.'''
    loaded = Settings.from_module('crawlmi.settings.default_settings')
    self.assertIn('CONCURRENT_REQUESTS', loaded)
    self.assertNotIn('blabla', loaded)
    self.assertNotIn('__name__', loaded)
def test_copy(self):
    '''copy() yields a distinct Settings instance with equal values.'''
    original = Settings({'a': 'b'})
    duplicate = original.copy()
    self.assertIsInstance(duplicate, Settings)
    self.assertIsNot(original, duplicate)
    self.assertDictEqual(original.values, duplicate.values)
def test_exception(self):
    '''A handler that raises makes the request's deferred fail.'''
    self.slot.download_handler = ExceptionDownloaderHandler(Settings())
    request, dfd = get_request('1')
    self.slot.enqueue(request, dfd)
    return self.assertFailure(dfd, Exception)
def setUp(self):
    '''Build Settings from the raw (unconverted) value of each case.'''
    self.settings = Settings(
        dict((k, v[0]) for (k, v) in self.tests.iteritems()))
class SettingsTest(unittest.TestCase):
    # Mapping of setting name -> (raw value, expected converted value).
    # The key prefix (BOOL/INT/FLOAT/LIST) selects which typed getter
    # the value is meant to exercise.
    tests = {
        'BOOL_TRUE_1': (1, True),
        'BOOL_TRUE_2': ('1', True),
        'BOOL_TRUE_3': (True, True),
        'BOOL_FALSE_1': (0, False),
        'BOOL_FALSE_2': ('0', False),
        'BOOL_FALSE_3': (False, False),
        'BOOL_FALSE_4': (None, False),

        'INT_1': (0, 0),
        'INT_2': ('1', 1),
        'INT_3': ('-1', -1),
        'INT_4': (4.5, 4),

        'FLOAT_1': (0.0, 0.0),
        'FLOAT_2': ('5.4', 5.4),
        'FLOAT_3': (47, 47.0),
        'FLOAT_4': ('47.3', 47.3),

        'LIST_1': (['one', 'two'], ['one', 'two']),
        'LIST_2': ('one,two', ['one', 'two']),
        'LIST_3': ('one', ['one']),
        'LIST_4': ('', ['']),
    }

    def setUp(self):
        '''Load every raw test value into a Settings instance.'''
        raw_values = dict((k, v[0]) for (k, v) in self.tests.iteritems())
        self.settings = Settings(raw_values)

    def test_from_module(self):
        loaded = Settings.from_module('crawlmi.settings.default_settings')
        self.assertIn('CONCURRENT_REQUESTS', loaded)
        self.assertNotIn('blabla', loaded)
        self.assertNotIn('__name__', loaded)

    def test_copy(self):
        original = Settings({'a': 'b'})
        duplicate = original.copy()
        self.assertIsInstance(duplicate, Settings)
        self.assertIsNot(original, duplicate)
        self.assertDictEqual(original.values, duplicate.values)

    def _get_answers(self, prefix):
        '''Return (name, expected value) pairs for keys with `prefix`.'''
        return [(k, v[1]) for (k, v) in self.tests.iteritems()
                if k.startswith(prefix)]

    def test_req_or_resp(self):
        req = Request('http://github.com/', meta={'INT_1': 10, 'a': 'b'})
        self.assertEqual(self.settings.get('INT_1', req_or_resp=req), 10)
        self.assertEqual(self.settings.get('a', req_or_resp=req), 'b')
        resp = Response('', request=req)
        self.assertEqual(self.settings.get('INT_1', req_or_resp=resp), 10)
        self.assertEqual(self.settings.get('a', req_or_resp=resp), 'b')

    def test_bool(self):
        for (name, expected) in self._get_answers('BOOL'):
            self.assertIs(self.settings.get_bool(name), expected, name)
        self.assertIs(self.settings.get_bool('invalid'), False)
        self.assertIs(self.settings.get_bool('invalid', True), True)
        self.assertRaises(ValueError, self.settings.get_bool, 'invalid',
                          'hello')

    def test_int(self):
        for (name, expected) in self._get_answers('INT'):
            self.assertEqual(self.settings.get_int(name), expected, name)
        self.assertEqual(self.settings.get_int('invalid'), 0)
        self.assertEqual(self.settings.get_int('invalid', 12), 12)
        self.assertRaises(ValueError, self.settings.get_int, 'invalid',
                          'hello')

    def test_float(self):
        for (name, expected) in self._get_answers('FLOAT'):
            self.assertEqual(self.settings.get_float(name), expected, name)
        self.assertEqual(self.settings.get_float('invalid'), 0.0)
        self.assertEqual(self.settings.get_float('invalid', 12.3), 12.3)
        self.assertRaises(ValueError, self.settings.get_float, 'invalid',
                          'hello')

    def test_list(self):
        for (name, expected) in self._get_answers('LIST'):
            self.assertEqual(self.settings.get_list(name), expected, name)
        self.assertEqual(self.settings.get_list('invalid'), [])
        listA = [1, 2, 3]
        self.assertIs(self.settings.get_list('invalid', listA), listA)
        listB = []
        self.assertIs(self.settings.get_list('invalid', listB), listB)

    def test_keys(self):
        keys = self.settings.keys()
        self.assertIsInstance(keys, list)
        # no duplicates, and exactly the configured names
        self.assertEqual(len(keys), len(set(keys)))
        self.assertSetEqual(set(keys), set(self.tests.keys()))
def test_load_base_spider(self):
    '''The spider0 module contributes no spiders to the manager.'''
    modules = ['crawlmi.tests.test_spider_manager.test_spiders.spider0']
    self.manager = SpiderManager(Settings({'SPIDER_MODULES': modules}))
    self.assertEqual(len(self.manager._spiders), 0)