示例#1
0
        def callback(response, **kwargs):
            """Handle one completed response: record stats, queue extracted links,
            and mark the page as crawled in the frontier."""
            stats.on_request(response.request)

            # Build follow-up requests for every link found on the page.
            page_links = extract_page_links(response)
            links = [grequests_get(url=link_url) for link_url in page_links]

            if links:
                frontier.links_extracted(response.request, links)
            frontier.page_crawled(response)

            print('Crawled', response.url, '(found', len(links), 'urls)')
示例#2
0
 def callback(response, **kwargs):
     """Record the finished request and report the crawled page, together with
     the follow-up requests built from its extracted links, to the frontier."""
     stats.on_request(response.request)
     new_requests = [grequests_get(url=link) for link in extract_page_links(response)]
     frontier.page_crawled(response=response, links=new_requests)
示例#3
0
    return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]


"""
The idea is to send requests to each domain with at least 5 seconds of delay. grequests only allows us to limit the
number of simultaneous requests. So, we are basically performing checks on every frontier iteration and limiting the
contents of each new frontier batch by sending the overused keys in the `info` argument to get_next_requests. This
way we end up with a 5-second delay per batch.
"""


if __name__ == '__main__':

    frontier = GRequestsFrontierManager(SETTINGS)
    stats = HostnameStatistics()
    frontier.add_seeds([grequests_get(url=url.strip()) for url in SEEDS])

    while True:
        def error_handler(request, exception):
            """Log a failed request and report the failure to the frontier."""
            print('Failed to process request', request.url, 'Error:', exception)
            reason = str(exception)
            frontier.request_error(request, reason)

        def callback(response, **kwargs):
            """On each crawled page: update per-host stats, hand newly discovered
            links to the frontier, then mark the page as crawled."""
            stats.on_request(response.request)
            links = [
                grequests_get(url=found_url)
                for found_url in extract_page_links(response)
            ]
            # Only notify the frontier about links when the page actually had any.
            if links:
                frontier.links_extracted(response.request, links)
            frontier.page_crawled(response)
            print('Crawled', response.url, '(found', len(links), 'urls)')
示例#4
0
 def callback(response, **kwargs):
     """Track the completed request, then tell the frontier the page was crawled
     along with the requests generated from its links."""
     stats.on_request(response.request)
     discovered = extract_page_links(response)
     follow_ups = [grequests_get(url=u) for u in discovered]
     frontier.page_crawled(response=response, links=follow_ups)