def _run_crawler_plain(self, crawler_class, other_options=None):
    url = self.get_url("/index.html")
    # Simulate a command-line invocation so Config can parse the options.
    sys.argv = ['pylinkchecker', '-m', 'process', url]
    if not other_options:
        other_options = []
    sys.argv.extend(other_options)
    config = Config()
    config.parse_cli_config()

    crawler = crawler_class(config, get_logger())
    crawler.crawl()

    return crawler.site
def get_page_crawler(self, url):
    url = self.get_url(url)
    url_split = get_clean_url_split(url)

    input_queue = compat.Queue.Queue()
    output_queue = compat.Queue.Queue()

    worker_config = WorkerConfig(
        username=None, password=None, types=['a', 'img', 'link', 'script'],
        timeout=5, parser=PARSER_STDLIB)

    worker_init = WorkerInit(
        worker_config=worker_config, input_queue=input_queue,
        output_queue=output_queue, logger=get_logger())

    page_crawler = PageCrawler(worker_init)

    return page_crawler, url_split
def get_page_crawler(self, url):
    url = self.get_url(url)
    url_split = get_clean_url_split(url)

    input_queue = compat.Queue.Queue()
    output_queue = compat.Queue.Queue()

    worker_config = WorkerConfig(
        username=None, password=None, types=['a', 'img', 'link', 'script'],
        timeout=5, parser=PARSER_STDLIB, strict_mode=False)

    worker_init = WorkerInit(
        worker_config=worker_config, input_queue=input_queue,
        output_queue=output_queue, logger=get_logger())

    page_crawler = PageCrawler(worker_init)

    return page_crawler, url_split
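# A minimal usage sketch, not part of the original tests: it only exercises
# the get_page_crawler helper defined above and inspects the URL split it
# returns. The test method name and the assertion on url_split.path are
# assumptions for illustration; url_split is assumed to be the SplitResult-like
# value produced by get_clean_url_split.
def test_get_page_crawler_returns_split_url(self):
    page_crawler, url_split = self.get_page_crawler("/index.html")
    # The crawler is only constructed here, not started; we check the parsed URL.
    self.assertTrue(url_split.path.endswith("/index.html"))
    self.assertIsNotNone(page_crawler)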