def test_build_spider_config_empty(self):
    class TestSpider(Spider):
        pass

    root_cfg = build_root_config('tests.files.settings_minimal')
    cfg = build_spider_config(TestSpider, root_cfg)
    self.assertEqual(cfg, DEFAULT_SPIDER_GLOBAL_CONFIG)
def test_build_spider_config1(self):
    modname = setup_settings_file({})
    default_config.default_config = {
        'CACHE': {'backend': 'mysql'},
        'VAR1': 'val1',
    }
    config = build_global_config(modname)
    spider_config = build_spider_config('foo', config)
    self.assertEqual(config['CACHE'], {'backend': 'mysql'})
def test_build_spider_config_overwrite(self):
    class TestSpider(Spider):
        pass

    root_cfg = build_root_config('test.files.settings_test_spider')
    cfg = build_spider_config(TestSpider, root_cfg)
    for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
        if key == 'spider_modules':
            self.assertEqual(cfg[key], ['zzz'])
        elif key == 'thread_number':
            self.assertEqual(cfg[key], 777)
        else:
            self.assertEqual(cfg[key], val)
def test_build_spider_config2(self):
    modname = setup_settings_file({})
    default_config.default_config = {
        'CACHE': {'backend': 'mysql'},
        'SPIDER_CONFIG_FOO': {
            'CACHE': {'backend': 'tokyo'},
        },
    }
    config = build_global_config(modname)
    spider_config = build_spider_config('foo', config)
    self.assertEqual(spider_config['CACHE'], {'backend': 'tokyo'})
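# --- Hedged illustration (not part of the library) ---
# The two config tests above rely on this merge rule: a spider-specific
# section named SPIDER_CONFIG_<SPIDERNAME> overrides matching top-level keys
# such as CACHE, while untouched keys fall through to the globals. The helper
# below is a minimal sketch of that rule under those assumptions; the real
# build_spider_config() may differ in details (key normalization, deep merge).
def merge_spider_config_sketch(global_config, spider_name):
    merged = dict(global_config)
    spider_section = global_config.get(
        'SPIDER_CONFIG_%s' % spider_name.upper(), {})
    merged.update(spider_section)
    return merged

# merge_spider_config_sketch(
#     {'CACHE': {'backend': 'mysql'},
#      'SPIDER_CONFIG_FOO': {'CACHE': {'backend': 'tokyo'}}},
#     'foo')['CACHE']  ->  {'backend': 'tokyo'}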
def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    # .get() avoids a KeyError when the flag is not passed through kwargs.
    default_logging(
        propagate_network_logger=kwargs.get('propagate_network_logger', False))

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s', lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        # Context manager closes the handle; text mode suits the str payload.
        with open('var/stats-%d.txt' % pid, 'w') as out:
            out.write(stats)
def test_setup_spider_config(self):
    class TestSpider(Spider):
        @classmethod
        def setup_spider_config(cls, config):
            config['foo'] = 'bar'

    root_cfg = build_root_config('tests.files.settings_minimal')
    cfg = build_spider_config(TestSpider, root_cfg)
    for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
        if key == 'foo':
            self.assertEqual(cfg[key], 'bar')
        else:
            self.assertEqual(cfg[key], val)
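# --- Hedged usage sketch (ExampleSpider is illustrative, not from the repo) ---
# The test above exercises the setup_spider_config() classmethod hook: it runs
# while the spider config is being built, so a spider class can inject its own
# keys before instantiation. Assumes Spider is the same base class imported by
# the tests above.
class ExampleSpider(Spider):
    @classmethod
    def setup_spider_config(cls, config):
        # Injected key; every other key falls through to the defaults.
        config['custom_flag'] = True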
def build_spider_instance(cls, settings_module):
    root_config = build_root_config(settings_module)
    spider_config = build_spider_config(cls, root_config)
    return cls(config=spider_config)
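# --- Hedged usage sketch ---
# ExampleSpider and 'myproject.settings' are placeholder names, not from the
# repo; the helper above just chains build_root_config() and
# build_spider_config() and hands the result to the spider constructor.
#
#     bot = build_spider_instance(ExampleSpider, 'myproject.settings')
#     bot.run()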
def main(spider_name, thread_number=None, settings_module='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, api_port=None, parser_pool_size=2,
         grab_log_file=None, network_log_file=None, network_service=None,
         grab_transport=None, **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()
    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report and spider_config.get('save_report'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.makedirs(dir_)
            else:
                clear_directory(dir_)
            for key, lst in bot.stat.collections.items():
                fname_key = key.replace('-', '_')
                save_list(lst, '%s/%s.txt' % (dir_, fname_key))
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
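# --- Hedged sketch of wiring main() to a command line ---
# Argument names mirror the keyword parameters of main() above; the project's
# real CLI layer may parse more options and dispatch differently.
if __name__ == '__main__':
    cli = ArgumentParser()
    cli.add_argument('spider_name')
    cli.add_argument('--thread-number', type=int, default=None)
    cli.add_argument('--settings-module', default='settings')
    cli.add_argument('--disable-report', action='store_true')
    # argparse maps --thread-number to thread_number, etc.
    main(**vars(cli.parse_args()))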
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = int(spider_config.get(
            'thread_number', deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))
    if spider_config.get('display_stats',
                         deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report and spider_config.get(
            'save_report', deprecated_key='GRAB_SAVE_REPORT'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            # items() instead of the Python 2-only iteritems()
            for key, lst in bot.items.items():
                fname_key = key.replace('-', '_')
                bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
            with open('%s/report.txt' % dir_, 'wb') as out:
                # Encode to bytes for the binary file handle.
                out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def build_spider_instance(cls, settings_module, **kwargs):
    root_config = build_root_config(settings_module)
    spider_config = build_spider_config(cls, root_config)
    return cls(config=spider_config)
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False, disable_proxy=False,
         ignore_lock=False, disable_report=False, disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)
    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report and spider_config.get('save_report'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            for key, lst in bot.items.items():
                fname_key = key.replace('-', '_')
                bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s', lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, _ = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        # Context manager closes the handle; text mode suits the str payload.
        with open('var/stats-%d.txt' % pid, 'w') as out:
            out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s', lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, _ = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if config.get('GRAB_SAVE_REPORT'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            bot.save_list('fatal', '%s/fatal.txt' % dir_)
            bot.save_list('task-count-rejected',
                          '%s/task_count_rejected.txt' % dir_)
            bot.save_list('network-count-rejected',
                          '%s/network_count_rejected.txt' % dir_)
            bot.save_list('task-with-invalid-url',
                          '%s/task_with_invalid_url.txt' % dir_)
            # Text mode: render_stats() returns a str.
            with open('%s/report.txt' % dir_, 'w') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
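# --- Hedged sketch of the clear_directory() helper used above ---
# Assumption: it empties a directory in place so the per-PID and 'last' report
# directories can be reused between runs; the project's actual helper may
# behave differently (e.g. keep subdirectories).
import os
import shutil

def clear_directory_sketch(path):
    for name in os.listdir(path):
        full = os.path.join(path, name)
        if os.path.isdir(full):
            shutil.rmtree(full)
        else:
            os.unlink(full)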