Example #1
from pylinkvalidator import api

def return_error_pages(site_links=None, config=None):
    # Avoid mutable default arguments; fall back to fresh empty values.
    site_links = site_links or []
    config = config or {}
    error_items = []
    crawled_site = api.crawl_with_options(
        site_links,
        config,
    )
    error_pages = crawled_site.error_pages
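    # error_pages is a dict: each value records the failing URL, its HTTP
    # status, and the sources (pages) that linked to it.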
    for raw in error_pages.values():
        sources = raw.sources
        sources_data = []
        for source in sources:
            source_url = source.origin.geturl()
            source_html = source.origin_str
            source_data = {
                'source_url': source_url,
                'source_html': source_html,
            }
            sources_data.append(source_data)
        item_url = raw.url_split.geturl()
        item_status = raw.status
        item_status_message = raw.get_status_message()
        data = {
            'sources': len(sources),
            'sources_data': sources_data,
            'item_url': item_url,
            'item_status': item_status,
            'item_status_message': item_status_message,
        }
        error_items.append(data)
    return error_items
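
A usage sketch for the function above (assuming pylinkvalidator is installed; the URL and options are illustrative, borrowed from the other examples on this page):

errors = return_error_pages(
    ["http://127.0.0.1:8080"],
    {"run-once": True, "workers": 2},
)
for error in errors:
    print(error["item_url"], error["item_status"], error["item_status_message"])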
Example #2
    def test_api_with_options(self):
        url = self.get_url("/index.html")

        site = api.crawl_with_options([url], {"run-once": True, "workers": 2})
        self.assertEqual(8, len(site.pages))
        self.assertEqual(0, len(site.error_pages))
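
The test above is written for a unittest-style suite in which self.get_url resolves a path against a local fixture server. Outside such a harness, an equivalent standalone call might look like this (the URL is a stand-in for the test server):

from pylinkvalidator import api

site = api.crawl_with_options(
    ["http://127.0.0.1:8080/index.html"],  # stand-in for self.get_url(...)
    {"run-once": True, "workers": 2},
)
print(len(site.pages), "pages,", len(site.error_pages), "error pages")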
Example #3
import http.server
import os
import socketserver
import threading

from pylinkvalidator.api import crawl_with_options

PORT = 8080

Handler = http.server.SimpleHTTPRequestHandler
site_dir = "docpage-source/my-project/site"

os.chdir(site_dir)
with socketserver.TCPServer(("", PORT), Handler) as httpd:
    print("serving at port", PORT)

    # serve_forever() blocks the calling thread, so run the server in a
    # background daemon thread; otherwise the crawl below would never start.
    threading.Thread(target=httpd.serve_forever, daemon=True).start()

    crawled_site = crawl_with_options(
        ["http://127.0.0.1:8080"],
        {
            "workers": 10,
            "test-outside": True,
            "progress": True,
        },
    )
    number_of_crawled_pages = len(crawled_site.pages)
    number_of_errors = len(crawled_site.error_pages)

    print(number_of_crawled_pages)
    print(number_of_errors)

    # Stop the background server once the crawl has finished.
    httpd.shutdown()
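
Because SimpleHTTPRequestHandler serves the current working directory, the script changes into the built site directory before starting the server; the daemon-thread plus httpd.shutdown() pairing keeps the server alive exactly as long as the crawl needs it.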

crawled_site = crawl_with_options(
    ["https://soo-underground.github.io"],
    {
        "workers": 10,
        "test-outside": True,
    },
)