Example #1
    def _run_crawler_plain(self, crawler_class, other_options=None):
        url = self.get_url("/index.html")
        # Build a fake command line so Config can parse it exactly as the
        # real pylinkchecker CLI would ("-m process" selects process mode).
        sys.argv = ['pylinkchecker', "-m", "process", url]
        if not other_options:
            other_options = []
        sys.argv.extend(other_options)
        config = Config()
        config.parse_cli_config()

        # Instantiate the crawler class under test, crawl the test site and
        # hand the resulting site object back to the caller.
        crawler = crawler_class(config, get_logger())
        crawler.crawl()

        return crawler.site
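A possible way to call this helper from a test, sketched under the assumption that ThreadSiteCrawler is one of the crawler classes the helper accepts and that the returned site object exposes an is_ok flag; neither name is confirmed by the snippet above.

    def test_crawl_whole_site(self):
        # Hypothetical test method: crawl the test site starting at
        # /index.html and check the overall result.
        site = self._run_crawler_plain(ThreadSiteCrawler)
        self.assertTrue(site.is_ok)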
Example #2
    def get_page_crawler(self, url):
        url = self.get_url(url)
        url_split = get_clean_url_split(url)
        # Queues the worker uses to receive URLs to crawl and to publish
        # crawl results.
        input_queue = compat.Queue.Queue()
        output_queue = compat.Queue.Queue()

        # Crawl <a>, <img>, <link> and <script> elements with the standard
        # library HTML parser, a 5 second timeout and no authentication.
        worker_config = WorkerConfig(username=None, password=None, types=['a',
                'img', 'link', 'script'], timeout=5, parser=PARSER_STDLIB)

        worker_init = WorkerInit(worker_config=worker_config,
                input_queue=input_queue, output_queue=output_queue,
                logger=get_logger())

        page_crawler = PageCrawler(worker_init)

        return page_crawler, url_split
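A minimal sketch of how a test might use this helper. The test method name is hypothetical, and it only inspects url_split, assuming get_clean_url_split returns a urlsplit-style result with a path attribute.

    def test_get_page_crawler(self):
        # Hypothetical test method: the helper wires a PageCrawler to fresh
        # input/output queues and returns it alongside the split test URL.
        page_crawler, url_split = self.get_page_crawler("/index.html")
        self.assertEqual("/index.html", url_split.path)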
Example #3
    def get_page_crawler(self, url):
        url = self.get_url(url)
        url_split = get_clean_url_split(url)
        input_queue = compat.Queue.Queue()
        output_queue = compat.Queue.Queue()

        # Same setup as the previous example, except that the worker is
        # created with strict_mode disabled.
        worker_config = WorkerConfig(username=None, password=None, types=['a',
                'img', 'link', 'script'], timeout=5, parser=PARSER_STDLIB,
                strict_mode=False)

        worker_init = WorkerInit(worker_config=worker_config,
                input_queue=input_queue, output_queue=output_queue,
                logger=get_logger())

        page_crawler = PageCrawler(worker_init)

        return page_crawler, url_split