def test_crawl(self, mock_get_page):
    """Crawl a local dump of the aiohttp docs and check the result set.

    NB: this is far more of an integration test than a unit test.
    Breaking it down into more unit-like elements might be a good idea,
    partly because it takes a huge time to run compared to the other
    tests, because we crawl the entirety of a documentation dump from
    the internet.
    """
    file_getter = FileGetter('aiohttp.readthedocs.org',
                             'aiohttp.readthedocs.org')

    # asyncio.coroutine was deprecated in Python 3.8 and removed in 3.11.
    # Use a native coroutine instead, preserving the old decorator's
    # behaviour of awaiting the wrapped callable's result when it is
    # itself a coroutine or future.
    async def fake_get_page(client, url):
        result = file_getter.get(url)
        if asyncio.iscoroutine(result) or asyncio.isfuture(result):
            result = await result
        return result

    mock_get_page.side_effect = fake_get_page

    pages = self.loop.run_until_complete(
        crawl(None, 'http://aiohttp.readthedocs.org/en/stable/index.html',
              loop=self.loop))

    # Rudimentary check to see that we've fetched all the pages referenced
    # by other pages:
    all_referenced_urls = set()
    for url, page in pages.items():
        if isinstance(page, Page):
            all_referenced_urls |= page.internal_urls - page.resource_urls
    self.assertEqual(set(pages), all_referenced_urls)

    # A page that could not be fetched should be recorded as the raised
    # Exception rather than a Page instance.
    self.assertIsInstance(
        pages['http://aiohttp.readthedocs.org/en/stable/_modules/aiohttp/'
              '_multidict.html'],
        Exception)
def test_crawl(self, mock_get_page):
    """Crawl a local dump of the aiohttp docs and check the result set.

    NB: this is far more of an integration test than a unit test.
    Breaking it down into more unit-like elements might be a good idea,
    partly because it takes a huge time to run compared to the other
    tests, because we crawl the entirety of a documentation dump from
    the internet.
    """
    file_getter = FileGetter(
        'aiohttp.readthedocs.org', 'aiohttp.readthedocs.org')

    # asyncio.coroutine was deprecated in Python 3.8 and removed in 3.11.
    # Replace it with a native coroutine, keeping the old decorator's
    # behaviour of awaiting the result when it is awaitable.
    async def fake_get_page(client, url):
        result = file_getter.get(url)
        if asyncio.iscoroutine(result) or asyncio.isfuture(result):
            result = await result
        return result

    mock_get_page.side_effect = fake_get_page

    pages = self.loop.run_until_complete(crawl(
        None, 'http://aiohttp.readthedocs.org/en/stable/index.html',
        loop=self.loop))

    # Rudimentary check to see that we've fetched all the pages referenced
    # by other pages:
    all_referenced_urls = set()
    for url, page in pages.items():
        if isinstance(page, Page):
            all_referenced_urls |= page.internal_urls - page.resource_urls
    self.assertEqual(set(pages), all_referenced_urls)

    # A page that could not be fetched should be recorded as the raised
    # Exception rather than a Page instance.
    self.assertIsInstance(pages[
        'http://aiohttp.readthedocs.org/en/stable/_modules/aiohttp/'
        '_multidict.html'], Exception)
def main(url, max_concurrent_requests, resources):
    """Crawl *url* and return the resulting graph of crawled pages.

    :param url: start URL for the crawl.
    :param max_concurrent_requests: cap on simultaneous HTTP requests,
        enforced by LimitedClientSession.
    :param resources: passed through to make_graph (controls whether
        resource URLs are included).
    """
    loop = asyncio.get_event_loop()
    client = LimitedClientSession(max_concurrent_requests, loop=loop)
    try:
        crawled_pages = loop.run_until_complete(crawl(client, url, loop=loop))
    finally:
        # Close both the client and the loop even if the crawl fails, so
        # no connections or loop resources leak on the error path.
        client.close()
        loop.close()
    # Bug fix: the literal placeholder 'url_here' was being passed to
    # make_graph instead of the actual start URL.
    return make_graph(url, crawled_pages, resources)
def main(url, max_concurrent_requests, resources):
    """Crawl *url* and return the resulting graph of crawled pages.

    :param url: start URL for the crawl.
    :param max_concurrent_requests: cap on simultaneous HTTP requests,
        enforced by LimitedClientSession.
    :param resources: passed through to make_graph (controls whether
        resource URLs are included).
    """
    loop = asyncio.get_event_loop()
    client = LimitedClientSession(max_concurrent_requests, loop=loop)
    try:
        crawled_pages = loop.run_until_complete(
            crawl(client, url, loop=loop))
    finally:
        # Close both the client and the loop even if the crawl fails, so
        # no connections or loop resources leak on the error path.
        client.close()
        loop.close()
    # Bug fix: the literal placeholder 'url_here' was being passed to
    # make_graph instead of the actual start URL.
    return make_graph(url, crawled_pages, resources)