def process_pdf(parser, pdfurl, errors):
    """Fetch and preprocess a single journal PDF.

    Recoverable parse failures (ValueError/IndexError) are appended to the
    caller-supplied *errors* list rather than aborting the run, mirroring
    the error handling used in the main fetch loop.

    :param parser: journal parser exposing fetch_and_preprocess()
    :param pdfurl: URL of the PDF to fetch
    :param errors: shared list that accumulates exceptions for reporting
    """
    # BUG FIX: the original rebound `errors = []` here, discarding the
    # caller's list so every collected error was silently lost.
    # NOTE(review): original passed an (apparently undefined) `out_of_cpu`
    # positionally; made consistent with the main loop's keyword call —
    # confirm no separate out_of_cpu callback exists elsewhere in the file.
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        parser.fetch_and_preprocess(pdfurl)
    except (ValueError, IndexError) as e:
        # The ValueError handler was commented out in the original even
        # though the sibling fetch loop catches it; re-enabled for parity.
        errors.append(e)
#parser.debug = True

# Manual debugging hook: flip the condition to True to parse one known
# PDF and exit without touching the full journal list.
if False:
    pdfurl = "http://www.stortinget.no/Global/pdf/postjournal/pj-2010-06-04-05.pdf"
    parse_pdf(pdfurl)
    exit(0)

pdfurls = []
add_pdf_lists(parser, pdfurls)

# Fetch all journal PDFs, accumulating recoverable parse errors so a
# single bad PDF does not abort the whole scrape.
errors = []
for pdfurl in pdfurls:
    # Bail out early (via the callback) if the CPU quota is nearly spent.
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        parser.fetch_and_preprocess(pdfurl)
    except (ValueError, IndexError) as e:
        # Collapsed the twin handlers: both appended to the same list.
        # `except X as e` replaces the py2-only `except X, e` syntax.
        errors.append(e)

try:
    parser.process_pages()
except (ValueError, IndexError) as e:
    errors.append(e)

report_errors(errors)

# Based on the scraper advanced-scraping-pdf
# See also
#parser.debug = True

# NOTE(review): this chunk duplicates the block immediately above it —
# likely a copy/paste artifact; consider removing one copy after
# confirming neither is referenced independently.
# Manual debugging hook: flip the condition to True to parse one known
# PDF and exit without touching the full journal list.
if False:
    pdfurl = "http://www.stortinget.no/Global/pdf/postjournal/pj-2010-06-04-05.pdf"
    parse_pdf(pdfurl)
    exit(0)

pdfurls = []
add_pdf_lists(parser, pdfurls)

# Fetch all journal PDFs, accumulating recoverable parse errors so a
# single bad PDF does not abort the whole scrape.
errors = []
for pdfurl in pdfurls:
    # Bail out early (via the callback) if the CPU quota is nearly spent.
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        parser.fetch_and_preprocess(pdfurl)
    except (ValueError, IndexError) as e:
        # Collapsed the twin handlers: both appended to the same list.
        # `except X as e` replaces the py2-only `except X, e` syntax.
        errors.append(e)

try:
    parser.process_pages()
except (ValueError, IndexError) as e:
    errors.append(e)

report_errors(errors)

# Based on the scraper advanced-scraping-pdf
# See also