def append_next_page(get_article_func, parsed_urls, page_index, page_url, doc, options):
    """Fetch page_url, extract its article body, append it to doc, and
    recurse into the following page until MAX_PAGES is reached."""
    logging.debug('appending next page: %s' % page_url)
    # Hard cap on recursion depth so a bad next-page detector cannot loop forever.
    if page_index >= MAX_PAGES:
        return
    fetcher = options['urlfetch']
    try:
        html = fetcher.urlread(page_url)
    except Exception:
        logging.warning('exception fetching %s' % page_url, exc_info=True)
        return
    orig_page_doc = parse(html, page_url)
    # Find the next-page link before extraction strips the pagination markup.
    next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
    page_article = get_article_func(orig_page_doc, options)
    page_doc = fragment_fromstring(page_article.html)
    make_page_elem(page_index, page_doc)
    if not is_suspected_duplicate(doc, page_doc):
        doc.append(page_doc)
        # Only keep walking if this page added new content and another page exists.
        if next_page_url is not None:
            append_next_page(get_article_func, parsed_urls, page_index + 1,
                             next_page_url, doc, options)
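# A minimal sketch of how this helper might be kicked off once the first
# page has been extracted. first_page_doc, first_url, get_article, and the
# contents of options are assumptions for illustration; the starting
# page_index of 1 is likewise assumed, not confirmed by this module.
#
# parsed_urls = set([first_url])
# next_url = find_next_page_url(parsed_urls, first_url, first_page_doc)
# if next_url is not None:
#     append_next_page(get_article, parsed_urls, 1, next_url,
#                      first_page_doc, options)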
def _html(self, force=False):
    # Lazily parse the raw input, caching the tree on the instance;
    # force=True discards the cache and re-parses.
    if force or self.html is None:
        self.html = parse(self.input, self.options['url'])
    return self.html