def process_page_queue(parser, errors):
    """Process the parser's queued pages and then check the CPU budget.

    Calls ``parser.process_pages()`` followed by
    ``postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)``.  If
    ``scraperwiki.CPUTimeExceededError`` is raised by either call, the
    message "Processing pages interrupted" is appended to ``errors``
    instead of letting the exception propagate.

    parser -- object providing a ``process_pages()`` method.
    errors -- list collecting error entries; mutated in place.
    """
    # A permanently disabled ``if False:`` reset of the unparsedpages queue
    # used to sit at the top of this try block; it could never run and was
    # removed.
    try:
        parser.process_pages()
        # presumably hands `errors` to out_of_cpu when no CPU time is left --
        # confirm against postlistelib
        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    except scraperwiki.CPUTimeExceededError:
        errors.append("Processing pages interrupted")
def process_page_queue(parser, errors):
    """Run the parser over its queued pages, recording a CPU-time interruption.

    ``parser.process_pages()`` is called first, then
    ``postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)``.  A
    ``scraperwiki.CPUTimeExceededError`` raised by either call is caught and
    reported by appending "Processing pages interrupted" to ``errors``.

    parser -- object providing a ``process_pages()`` method.
    errors -- list collecting error entries; mutated in place.
    """
    try:
        parser.process_pages()
        # presumably invokes out_of_cpu with `errors` once the CPU budget is
        # exhausted -- confirm against postlistelib
        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    except scraperwiki.CPUTimeExceededError:
        # The exception object itself is not needed, so it is not bound.
        errors.append("Processing pages interrupted")
# Collect the journal PDF URLs; add_pdf_lists fills `pdfurls` in place.
pdfurls = []
add_pdf_lists(parser, pdfurls)

# Fetch all journal PDFs
errors = []
for pdfurl in pdfurls:
    # Check the CPU budget before every fetch.  Presumably no_cpu_left is
    # called with `errors` when the budget is used up -- confirm against
    # postlistelib.
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        parser.fetch_and_preprocess(pdfurl)
    except (ValueError, IndexError) as e:
        # Record the failure and continue with the next PDF.
        errors.append(e)

# NOTE(review): the page queue is processed once, after the fetch loop --
# confirm this matches the intended structure.
try:
    parser.process_pages()
except (ValueError, IndexError) as e:
    errors.append(e)

report_errors(errors)

# Based on the scraper advanced-scraping-pdf
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf

import scraperwiki
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
# Build the list of journal PDF URLs; add_pdf_lists appends to `pdfurls`.
pdfurls = []
add_pdf_lists(parser, pdfurls)

# Fetch all journal PDFs
errors = []
for pdfurl in pdfurls:
    # Budget check ahead of each download.  Presumably no_cpu_left receives
    # `errors` when no CPU time remains -- confirm against postlistelib.
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        parser.fetch_and_preprocess(pdfurl)
    except (ValueError, IndexError) as e:
        # Keep going with the remaining PDFs; the error is reported later.
        errors.append(e)

# NOTE(review): queued pages are processed once, after all PDFs have been
# fetched -- confirm this matches the intended structure.
try:
    parser.process_pages()
except (ValueError, IndexError) as e:
    errors.append(e)

report_errors(errors)

# Based on the scraper advanced-scraping-pdf
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf

import scraperwiki
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html