def test_crawl(self, mock_get_page):
    """Crawl a local dump of the aiohttp docs and check the result set.

    NB: this is far more of an integration test than a unit test.
    Breaking it down into more unit-like elements might be a good idea,
    partly because it takes a huge time to run compared to the other
    tests, because we crawl the entirety of a documentation dump from
    the internet.
    """
    file_getter = FileGetter('aiohttp.readthedocs.org',
                             'aiohttp.readthedocs.org')

    # asyncio.coroutine was deprecated in Python 3.8 and removed in 3.11.
    # Use a native coroutine instead, preserving the old decorator's
    # behaviour of awaiting the wrapped callable's result when it is
    # itself a coroutine or future.
    async def fake_get_page(client, url):
        result = file_getter.get(url)
        if asyncio.iscoroutine(result) or asyncio.isfuture(result):
            result = await result
        return result

    mock_get_page.side_effect = fake_get_page

    pages = self.loop.run_until_complete(
        crawl(None, 'http://aiohttp.readthedocs.org/en/stable/index.html',
              loop=self.loop))

    # Rudimentary check to see that we've fetched all the pages referenced
    # by other pages:
    all_referenced_urls = set()
    for url, page in pages.items():
        if isinstance(page, Page):
            all_referenced_urls |= page.internal_urls - page.resource_urls
    self.assertEqual(set(pages), all_referenced_urls)

    # A page that could not be fetched should be recorded as the raised
    # Exception rather than a Page instance.
    self.assertIsInstance(
        pages['http://aiohttp.readthedocs.org/en/stable/_modules/aiohttp/'
              '_multidict.html'],
        Exception)
def test_crawl(self, mock_get_page):
    """Crawl a local dump of the aiohttp docs and check the result set.

    NB: this is far more of an integration test than a unit test.
    Breaking it down into more unit-like elements might be a good idea,
    partly because it takes a huge time to run compared to the other
    tests, because we crawl the entirety of a documentation dump from
    the internet.
    """
    file_getter = FileGetter(
        'aiohttp.readthedocs.org', 'aiohttp.readthedocs.org')

    # asyncio.coroutine was deprecated in Python 3.8 and removed in 3.11.
    # Replace it with a native coroutine, keeping the old decorator's
    # behaviour of awaiting the result when it is awaitable.
    async def fake_get_page(client, url):
        result = file_getter.get(url)
        if asyncio.iscoroutine(result) or asyncio.isfuture(result):
            result = await result
        return result

    mock_get_page.side_effect = fake_get_page

    pages = self.loop.run_until_complete(crawl(
        None, 'http://aiohttp.readthedocs.org/en/stable/index.html',
        loop=self.loop))

    # Rudimentary check to see that we've fetched all the pages referenced
    # by other pages:
    all_referenced_urls = set()
    for url, page in pages.items():
        if isinstance(page, Page):
            all_referenced_urls |= page.internal_urls - page.resource_urls
    self.assertEqual(set(pages), all_referenced_urls)

    # A page that could not be fetched should be recorded as the raised
    # Exception rather than a Page instance.
    self.assertIsInstance(pages[
        'http://aiohttp.readthedocs.org/en/stable/_modules/aiohttp/'
        '_multidict.html'], Exception)
def main(url, max_concurrent_requests, resources):
    """Crawl *url* and return the resulting graph of crawled pages.

    :param url: start URL for the crawl.
    :param max_concurrent_requests: cap on simultaneous HTTP requests,
        enforced by LimitedClientSession.
    :param resources: passed through to make_graph (controls whether
        resource URLs are included).
    """
    loop = asyncio.get_event_loop()
    client = LimitedClientSession(max_concurrent_requests, loop=loop)
    try:
        crawled_pages = loop.run_until_complete(crawl(client, url, loop=loop))
    finally:
        # Close both the client and the loop even if the crawl fails, so
        # no connections or loop resources leak on the error path.
        client.close()
        loop.close()
    # Bug fix: the literal placeholder 'url_here' was being passed to
    # make_graph instead of the actual start URL.
    return make_graph(url, crawled_pages, resources)
def main(url, max_concurrent_requests, resources):
    """Crawl *url* and return the resulting graph of crawled pages.

    :param url: start URL for the crawl.
    :param max_concurrent_requests: cap on simultaneous HTTP requests,
        enforced by LimitedClientSession.
    :param resources: passed through to make_graph (controls whether
        resource URLs are included).
    """
    loop = asyncio.get_event_loop()
    client = LimitedClientSession(max_concurrent_requests, loop=loop)
    try:
        crawled_pages = loop.run_until_complete(
            crawl(client, url, loop=loop))
    finally:
        # Close both the client and the loop even if the crawl fails, so
        # no connections or loop resources leak on the error path.
        client.close()
        loop.close()
    # Bug fix: the literal placeholder 'url_here' was being passed to
    # make_graph instead of the actual start URL.
    return make_graph(url, crawled_pages, resources)