Example #1
def process_page_queue(parser, errors):
    try:
        if False:  # Disabled one-off cleanup: drop stored rows for still-queued pages, then clear the queue
            scraperwiki.sqlite.execute("delete from swdata where scrapedurl in (select scrapedurl from unparsedpages)")
            scraperwiki.sqlite.execute("delete from unparsedpages")
            scraperwiki.sqlite.commit()

        parser.process_pages()
        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    except scraperwiki.CPUTimeExceededError:
        errors.append("Processing pages interrupted")
Example #2
def process_page_queue(parser, errors):
    try:
        parser.process_pages()
        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    except scraperwiki.CPUTimeExceededError:
        errors.append("Processing pages interrupted")
Example #4
# Based on the scraper advanced-scraping-pdf
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
import scraperwiki
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html

# parser, postlistelib, add_pdf_lists, no_cpu_left and report_errors are
# defined or imported elsewhere in the scraper.
pdfurls = []
add_pdf_lists(parser, pdfurls)

# Fetch and preprocess all journal PDFs, collecting errors per URL
# instead of aborting on the first failure.
errors = []
for pdfurl in pdfurls:
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        parser.fetch_and_preprocess(pdfurl)
    except ValueError as e:
        errors.append(e)
    except IndexError as e:
        errors.append(e)

# Parse the fetched pages into structured journal records.
try:
    parser.process_pages()
except ValueError as e:
    errors.append(e)
except IndexError as e:
    errors.append(e)

report_errors(errors)
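report_errors is also outside the excerpt. A plausible minimal version, assuming the scraper only needs the collected errors to surface at the end of the run (names and behaviour are assumptions, not the shared library's actual code):

def report_errors(errors):
    # Hypothetical sketch: print everything collected, then fail the run
    # so the problems are visible in the scraper's status.
    if errors:
        print "%d errors encountered:" % len(errors)
        for e in errors:
            print e
        raise ValueError("errors encountered during scraping")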