def test_explicit_template_name(self):
    """Verify that start_project.main honors an explicit template path.

    Runs the project generator inside TMP_DIR with a custom template
    directory and checks that a file from the template was copied.
    """
    old_dir = os.getcwd()
    os.chdir(TMP_DIR)
    try:
        test_dir = os.path.dirname(__file__)
        project_sample_path = os.path.join(test_dir, 'files/project_sample')
        start_project.main(project_name='test_project',
                           template=project_sample_path)
        # BUG FIX: the original asserted on os.path.join(...), which returns
        # a non-empty string and is therefore always truthy — the assertion
        # could never fail. Check that the file actually exists.
        self.assertTrue(
            os.path.exists(os.path.join(TMP_DIR, 'test_project/foo.py')))
    finally:
        # Restore the working directory even if the assertion fails,
        # so later tests are not affected.
        os.chdir(old_dir)
    dir_ = os.path.join(TMP_DIR, 'test_project')
    clear_directory(dir_)
    os.rmdir(dir_)
def test_start_project(self):
    """Default template must create var/, var/log, var/run and a spider module.

    Also checks the generated spider.py contains the camel-cased class
    name derived from the project name.
    """
    old_dir = os.getcwd()
    os.chdir(TMP_DIR)
    dir_ = os.path.join(TMP_DIR, 'test_project')
    try:
        start_project.main(project_name='test_project', template=None)
    finally:
        # Restore cwd even if project generation raises.
        os.chdir(old_dir)
    for subpath in ('test_project/var',
                    'test_project/var/log',
                    'test_project/var/run'):
        self.assertTrue(os.path.exists(os.path.join(TMP_DIR, subpath)))
    path = os.path.join(TMP_DIR, 'test_project/spider.py')
    # FIX: use a context manager instead of open(path).read(), which
    # leaked the file handle (relies on GC to close it).
    with open(path) as inp:
        self.assertTrue('TestProjectSpider' in inp.read())
    clear_directory(dir_)
    os.rmdir(dir_)
def test_start_project(self):
    """Default template must create var/, var/log, var/run and a spider module.

    NOTE(review): this method has the same name as an earlier
    test_start_project in the file; if both live in one class, this
    definition shadows the first and only one of them runs. Confirm
    which is canonical and rename or delete the other.
    """
    old_dir = os.getcwd()
    os.chdir(TMP_DIR)
    dir_ = os.path.join(TMP_DIR, 'test_project')
    try:
        start_project.main(project_name='test_project', template=None)
    finally:
        # Restore cwd even if project generation raises.
        os.chdir(old_dir)
    for subpath in ('test_project/var',
                    'test_project/var/log',
                    'test_project/var/run'):
        self.assertTrue(os.path.exists(os.path.join(TMP_DIR, subpath)))
    path = os.path.join(TMP_DIR, 'test_project/spider.py')
    # FIX: use a context manager instead of open(path).read(), which
    # leaked the file handle (relies on GC to close it).
    with open(path) as inp:
        self.assertTrue('TestProjectSpider' in inp.read())
    clear_directory(dir_)
    os.rmdir(dir_)
def main(spider_name, thread_number=None, settings_module='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, api_port=None, parser_pool_size=2,
         grab_log_file=None, network_log_file=None, network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    """Load a spider class by name, configure it and run it to completion.

    Pipeline: set up logging, resolve the spider class from the project
    settings module, build its config, optionally parse spider-specific
    CLI args, instantiate and wire up queue/cache/proxies/command
    interfaces, run, then render stats and (unless disabled) write a
    per-pid and 'last' report under var/.

    :param spider_name: key used by load_spider_class to resolve the class
    :param thread_number: worker thread count passed through to the spider
    :param settings_module: dotted module name fed to build_root_config
    :param network_logs: propagated to the network logger setup
    :param disable_proxy: if True, skip loading the configured proxy list
    :param ignore_lock: accepted but unused here (hence the pylint disable)
    :param disable_report: if True, skip writing report files under var/
    :param api_port: forwarded to the spider as http_api_port
    :returns: dict with key 'spider_stats' holding the rendered stats text
    """
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)
    spider_args = None
    # Spiders may expose extra CLI options; parse_known_args ignores
    # anything this parser does not recognize.
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)
    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    # Optional subsystems — each is wired only if present in the config.
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)
    # Ctrl-C stops the crawl but still lets stats/report logic below run.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    stats = bot.render_stats()
    if spider_config.get('display_stats'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)
    if not disable_report:
        if spider_config.get('save_report'):
            # Write the report twice: under var/<pid>/ for this run and
            # under var/last/ as the always-current copy.
            # NOTE(review): paths are relative to the current working
            # directory — presumably the project root; confirm callers
            # chdir appropriately.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    # Stat keys may contain '-', which is awkward in
                    # file names.
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))
    return {
        'spider_stats': bot.render_stats(),
    }
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, disable_report=False,
         disable_default_logs=False, *args, **kwargs):
    """Load a spider class by name, configure it and run it to completion.

    Pipeline: set up logging, resolve the spider class from the project
    settings module, build its config, optionally parse spider-specific
    CLI args, instantiate and wire up queue/cache/proxies/command
    interfaces, run, then render stats and (unless disabled) write a
    per-pid and 'last' report under var/.

    :param spider_name: key used by load_spider_class to resolve the class
    :param thread_number: worker thread count passed through to the spider
    :param slave: forwarded verbatim to the spider constructor
    :param settings_module: dotted module name fed to build_root_config
    :param network_logs: propagated to the network logger setup
    :param disable_proxy: if True, skip loading the configured proxy list
    :param ignore_lock: accepted but unused here
    :param disable_report: if True, skip writing report files under var/
    :param disable_default_logs: if True, suppress grab/network log files
    :returns: dict with 'spider_stats' and 'spider_timing' rendered text
    """
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)
    spider_args = None
    # Spiders may expose extra CLI options; parse_known_args ignores
    # anything this parser does not recognize.
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    # Optional subsystems — each is wired only if present in the config.
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)
    # Ctrl-C stops the crawl but still lets stats/report logic below run.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)
    if spider_config.get('display_stats'):
        logger.debug(stats)
    pid = os.getpid()
    # FIX: lazy %-style logging args instead of eager string formatting.
    logger.debug('Spider pid is %d', pid)
    if not disable_report:
        if spider_config.get('save_report'):
            # Write the report twice: under var/<pid>/ for this run and
            # under var/last/ as the always-current copy. Paths are
            # relative to the current working directory.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    # FIX: makedirs instead of mkdir — mkdir raises if
                    # the parent 'var' directory does not exist yet
                    # (the sibling variant of main already uses makedirs).
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    # Stat keys may contain '-', which is awkward in
                    # file names.
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))
    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }