import sys

from twisted.internet import defer


def crawl_pages(start_page, url_matcher, encoding, timeout, max_num, parser):
    pages_d = defer.Deferred()
    spider = PageSpider(url_matcher, max_num)
    spider.parser = parser

    def on_err(errobj):
        # get_page errbacks with a (url, reason) pair; report the failed URL
        # to the spider so it can be retried or dropped.
        evalue = errobj.value
        if isinstance(evalue, tuple) and len(evalue) == 2:
            url, reason = evalue
            spider.fail_page(url)

    # one-shot guard: pages_d must only fire once, even though page_recvd
    # keeps being called for requests that are already in flight
    defer_fired = []

    def page_recvd((url, text)):
        try:
            spider.add_page(url, text)
        except SpiderIsFull:
            if not defer_fired:
                defer_fired.append(None)
                pages_d.callback(spider.get_crawled())
            return
        urls = spider.give_all_jobs()
        # draw a point as a progress indicator
        sys.stdout.write('.')
        sys.stdout.flush()
        for url in urls:
            next_d = get_page(str(url), enc=encoding, timeout=timeout)
            next_d.addCallback(page_recvd)
            next_d.addErrback(on_err)

    # initial crawl: the start page must succeed
    d = get_page(start_page, enc=encoding, timeout=timeout, must_succ=True)
    d.addCallback(page_recvd)
    d.addErrback(on_err)

    return pages_d  # deferred that will fire with the crawled pages
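
# Usage sketch (an addition, not part of the original code): a minimal,
# hypothetical driver showing how crawl_pages could be run under the Twisted
# reactor. It assumes PageSpider, get_page and the rest are defined as above;
# `my_matcher` and `my_parser` are placeholder names for whatever URL filter
# and page parser the surrounding code provides.
from twisted.internet import reactor


def demo_crawl():
    d = crawl_pages('http://example.com/',
                    url_matcher=my_matcher,   # placeholder: decides which links to follow
                    encoding='utf-8',
                    timeout=30,
                    max_num=50,               # fire pages_d once 50 pages are collected
                    parser=my_parser)         # placeholder: extracts links from a page

    def report(pages):
        # assumes get_crawled() returns a collection of pages
        print '\ncrawled %d pages' % len(pages)
        return pages

    d.addCallback(report)
    d.addBoth(lambda _: reactor.stop())       # stop the reactor on success or failure


if __name__ == '__main__':
    reactor.callWhenRunning(demo_crawl)
    reactor.run()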