Example #1
    def _run_crawler_plain(
            self, crawler_class, other_options=None, url="/index.html"):
        url = self.get_url(url)
        # Build a fake command line so Config.parse_cli_config() reads
        # it as if pylinkvalidator had been invoked from the shell.
        sys.argv = ['pylinkvalidator', "-m", "process", url]
        if not other_options:
            other_options = []
        sys.argv.extend(other_options)
        config = Config()
        config.parse_cli_config()

        crawler = crawler_class(config, get_logger())
        crawler.crawl()

        # In multi-site mode, aggregate the results of all start URLs.
        if config.options.multi:
            crawler.site.collect_multi_sites()

        return crawler.site
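
A minimal sketch of how a test might build on this helper; the test name and the is_ok assertion are assumptions about pylinkvalidator's Site object, not part of the original suite:

    def test_crawl_index_ok(self):
        # Hypothetical test: crawl the default /index.html with the
        # threaded crawler and assert the site came back clean.
        # site.is_ok is an assumed property of the Site object.
        site = self._run_crawler_plain(ThreadSiteCrawler)
        self.assertTrue(site.is_ok)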
Example #2
    def get_page_crawler(self, url):
        url = self.get_url(url)
        url_split = get_clean_url_split(url)
        # Work flows into the crawler through input_queue and results
        # come back on output_queue.
        input_queue = compat.Queue.Queue()
        output_queue = compat.Queue.Queue()

        # Crawl a, img, link and script elements with the standard
        # library parser, a 5 second timeout and no authentication.
        worker_config = WorkerConfig(
            username=None, password=None, types=['a', 'img', 'link', 'script'],
            timeout=5, parser=PARSER_STDLIB,
            strict_mode=False, prefer_server_encoding=False,
            extra_headers=[])

        worker_init = WorkerInit(
            worker_config=worker_config,
            input_queue=input_queue, output_queue=output_queue,
            logger=get_logger())

        page_crawler = PageCrawler(worker_init)

        return page_crawler, url_split
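
A hedged sketch of driving the crawler returned by this helper through its queues. WorkerInput's fields, the WORK_DONE sentinel, and crawl_page_forever are assumptions about pylinkvalidator's internals and may not match the real signatures:

    def test_crawl_single_page(self):
        # Hypothetical test; everything below the helper call is an
        # assumed view of pylinkvalidator's worker model.
        page_crawler, url_split = self.get_page_crawler("/index.html")

        # Enqueue one unit of work plus the sentinel assumed to stop
        # the crawl loop.
        page_crawler.input_queue.put(
            WorkerInput(url_split, True, 0, url_split.netloc))
        page_crawler.input_queue.put(WORK_DONE)
        page_crawler.crawl_page_forever()

        # The crawled page (status, links, errors) comes back on the
        # output queue.
        page = page_crawler.output_queue.get()
        self.assertTrue(page is not None)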
Example #3
    def test_url_file_path(self):
        # mkstemp returns an open OS-level file descriptor along with
        # the path; close it so the handle is not leaked when the file
        # is reopened by path below.
        (fd, temp_file_path) = mkstemp()
        os.close(fd)
        url = self.get_url("/index.html")
        url2 = self.get_url("/robots.txt")
        with open(temp_file_path, "w") as temp_file:
            temp_file.write(url + "\n")
            temp_file.write(url2 + "\n")

        # Crawl the two URLs listed in the file instead of passing
        # them on the command line.
        sys.argv = [
            "pylinkvalidator", "-m", "process", "--url-file-path",
            temp_file_path]
        config = Config()
        config.parse_cli_config()

        crawler = ThreadSiteCrawler(config, get_logger())
        crawler.crawl()

        site = crawler.site
        self.assertEqual(12, len(site.pages))
        self.assertEqual(1, len(site.error_pages))
        os.unlink(temp_file_path)
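
An alternative sketch of the same setup using tempfile.NamedTemporaryFile instead of mkstemp, which sidesteps the raw file descriptor entirely; the test name is hypothetical, a module-level "import tempfile" is assumed, and the flow otherwise mirrors the test above:

    def test_url_file_path_named_tempfile(self):
        # NamedTemporaryFile(delete=False) yields a regular file
        # object, so no separate os.close of a raw descriptor is
        # needed before writing.
        with tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
            temp_file.write(self.get_url("/index.html") + "\n")
            temp_file.write(self.get_url("/robots.txt") + "\n")
            temp_file_path = temp_file.name

        sys.argv = [
            "pylinkvalidator", "-m", "process", "--url-file-path",
            temp_file_path]
        config = Config()
        config.parse_cli_config()

        crawler = ThreadSiteCrawler(config, get_logger())
        crawler.crawl()
        os.unlink(temp_file_path)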