# Assumed import: the crawl calls below match pylinkvalidator's `api` module.
from pylinkvalidator import api


def return_error_pages(site_links=None, config=None):
    """Crawl the given URLs and return a summary of every error page."""
    # Avoid mutable default arguments.
    site_links = site_links or []
    config = config or {}

    error_items = []
    crawled_site = api.crawl_with_options(site_links, config)
    error_pages = crawled_site.error_pages

    for item in error_pages:
        raw = error_pages[item]
        sources = raw.sources

        # Every page that links to the broken URL.
        sources_data = []
        for source in sources:
            source_url = source.origin.geturl()
            source_html = source.origin_str
            sources_data.append({
                'source_url': source_url,
                'source_html': source_html,
            })

        item_url = raw.url_split.geturl()
        item_status = raw.status
        item_status_message = raw.get_status_message()

        error_items.append({
            'sources': len(sources),
            'sources_data': sources_data,
            'item_url': item_url,
            'item_status': item_status,
            'item_status_message': item_status_message,
        })

    return error_items
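# Usage sketch for return_error_pages. Assumptions: the options dict takes the
# same keys used elsewhere in this file ("run-once", "workers"), and the URL is
# a placeholder for a locally served site.
if __name__ == '__main__':
    errors = return_error_pages(
        ["http://127.0.0.1:8080"],
        {"run-once": True, "workers": 2},
    )
    for entry in errors:
        print(entry['item_status_message'], entry['item_url'],
              'linked from', entry['sources'], 'page(s)')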
def test_api_with_options(self):
    url = self.get_url("/index.html")
    site = api.crawl_with_options([url], {"run-once": True, "workers": 2})
    self.assertEqual(8, len(site.pages))
    self.assertEqual(0, len(site.error_pages))
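# A minimal harness sketch for the test above, assuming get_url() maps a path
# to a local HTTP server that serves a fixture site. The class name, fixture
# directory, and port handling here are hypothetical, not the project's actual
# setup.
import http.server
import os
import socketserver
import threading
import unittest


class CrawlerTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        os.chdir("tests/fixtures/site")  # hypothetical fixture directory
        cls.httpd = socketserver.TCPServer(
            ("127.0.0.1", 0), http.server.SimpleHTTPRequestHandler)
        cls.port = cls.httpd.server_address[1]
        # Serve in the background so the test body can run the crawl.
        threading.Thread(target=cls.httpd.serve_forever, daemon=True).start()

    @classmethod
    def tearDownClass(cls):
        cls.httpd.shutdown()

    def get_url(self, path):
        return "http://127.0.0.1:%d%s" % (self.port, path)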
import http.server
import os
import socketserver
import threading

# Assumed import path: crawl_with_options is used here the same way as the
# pylinkvalidator-style api module above.
from pylinkvalidator.api import crawl_with_options

PORT = 8080
Handler = http.server.SimpleHTTPRequestHandler
site_dir = "docpage-source/my-project/site"

# Serve the generated site locally so it can be crawled.
os.chdir(site_dir)
with socketserver.TCPServer(("", PORT), Handler) as httpd:
    print("serving at port", PORT)
    # serve_forever() blocks, so run it in a background thread; otherwise the
    # crawl below would never start.
    threading.Thread(target=httpd.serve_forever, daemon=True).start()

    # Crawl the locally served copy of the site.
    crawled_site = crawl_with_options(
        ["http://127.0.0.1:8080"],
        {
            "workers": 10,
            "test-outside": True,
            "progress": True,
        },
    )
    number_of_crawled_pages = len(crawled_site.pages)
    number_of_errors = len(crawled_site.error_pages)
    print(number_of_crawled_pages)
    print(number_of_errors)

    httpd.shutdown()

# Crawl the published site with the same options.
crawled_site = crawl_with_options(
    ["https://soo-underground.github.io"],
    {
        "workers": 10,
        "test-outside": True,
        "progress": True,
    },
)
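# Follow-up sketch: summarise the second crawl's results using the same result
# attributes as return_error_pages above (error_pages, url_split,
# get_status_message).
print(len(crawled_site.pages), "pages crawled,",
      len(crawled_site.error_pages), "errors")
for key in crawled_site.error_pages:
    page = crawled_site.error_pages[key]
    print(page.get_status_message(), page.url_split.geturl())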