Example #1
def test_small_pdfs(parser):
    # Test with some smaller PDFs
    errors = []
    #parser.debug = True
    newurl = "http://www.saltdal.kommune.no/images/module.files/010612.pdf"
    if not parser.is_already_scraped(newurl):
        process_pdf(parser, newurl, errors) # New format
    if parser.is_already_scraped(newurl):
        print "Already parsed"
    else:
        raise ValueError("Failed to parse")
#    process_pdf(parser, "http://www.saltdal.kommune.no/images/module.files/2007-01-31.pdf", errors) # Old format
    process_page_queue(parser, errors)
    report_errors(errors)
    exit(0)
def process_journal_pdfs(parser, listurl, errors):
    #    print "Finding PDFs on " + listurl
    #    u = urllib.parse.urlparse(listurl)
    # http://www.ostfold-f.kommune.no/offjournal_show_index.asp?tmpSearch=1
    for week in range(1, 53):
        params = urllib.urlencode({"tmpSearch": str(week), "btnSubmit": "Vis", "strHandling": "uke"})

        headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/html"}
        conn = httplib.HTTPConnection("www.ostfold-f.kommune.no:80")
        conn.request("POST", "/offjournal_show_index.asp", params, headers)
        response = conn.getresponse()
        html = response.read()
        conn.close()
        root = lxml.html.fromstring(html)
        html = None
        for ahref in root.cssselect("table a"):
            href = ahref.attrib["href"]
            url = urlparse.urljoin(listurl, href)
            if -1 != href.find("file://") or -1 == url.find(".pdf"):
                #            print "Skipping non-http URL " + url
                continue
            if parser.is_already_scraped(url):
                True
            #            print "Skipping already scraped " + url
            else:
                #            print "Will process " + url
                process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors):
    #    print "Finding PDFs on " + listurl
    #    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("table a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 == url.find("/postliste-"):
            #            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
            #            print "Will process " + url
            process_pdf(parser, url, errors)
    # Follow the "next page" link to the end
    for ahref in root.cssselect("center a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        print ahref.text, url
        if -1 != ahref.text.find("Neste side"):
            process_journal_pdfs(parser, url, errors)
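Every example in this collection repeats the same check before fetching a PDF: skip it if the parser has already scraped it, otherwise hand it to process_pdf. A minimal sketch of how that check could be factored out (the helper name is an assumption; the original scrapers inline it in each link loop):

def consider_pdf_url(parser, url, errors):
    # Hypothetical helper, not part of the original scrapers: wraps the
    # recurring "skip if already scraped, otherwise process" check.
    if parser.is_already_scraped(url):
        # Nothing to do, the PDF was handled in an earlier run.
        return False
    process_pdf(parser, url, errors)
    return True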
Example #4
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("table#hovedinnhold a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "+")
        if -1 != href.find("file://"):
#            print "Skipping non-http URL " + url
            continue
        if -1 == url.find(".pdf"):
            continue
        # Special case, file indicating no journal entries this day
        if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \
            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url or \
            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/777B497BB48936ACC1257A450033E1D4/$FILE/Postjournal+20.07.12.pdf" == url or \
            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/1802A0FF57C08EFEC1257A4500337345/$FILE/Postjournal+16.07.12.pdf" == url or \
            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/90373A38701C27E5C1257A45002F63FD/$FILE/Postjournal+12.07.12.pdf" == url or \
            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/6B00A3BD92B3C2AEC1257A45002F4044/$FILE/Postjournal+10.07.12.pdf" == url or \
            "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/0141B5488D38B8FEC1257A44003756ED/$FILE/Postjournal+06.07.12.pdf" == url:
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div#placeholder-content-main-left-column a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 == url.find('/postjournal/article'):
#            print "Skipping non-http URL " + url
            continue
        subhtml = scraperwiki.scrape(url)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.article-content a"):
            subhref = subahref.attrib['href']
            suburl = urlparse.urljoin(listurl, subhref)
            if -1 == suburl.find(".pdf"):
                continue
            if parser.is_already_scraped(suburl):
                True
#            print "Skipping already scraped " + suburl
            else:
#                print "Will process " + suburl
                process_pdf(parser, suburl, errors)
def process_journal_pdf_directory(parser, listurl, errors):
    #html = scraperwiki.scrape(listurl)
    html = lazycache.lazycache(listurl)
    root = lxml.html.fromstring(html)
    html = None

    pdflisturls = []
    for ahref in root.cssselect("span.vrtx-paging-wrapper a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        pdflisturls.append(url)
#    print pdflisturls

    for listurl in pdflisturls:
        html = scraperwiki.scrape(listurl)
        root = lxml.html.fromstring(html)
        html = None
        urlseen = {}
        for ahref in root.cssselect("div.vrtx-resource a"):
            href = ahref.attrib['href']
            url = urlparse.urljoin(listurl, href)
            if -1 == url.find(".pdf"):
                continue
            # Ignore duplicates with M: as part of the name
            if -1 != url.find("/M%"):
                continue
            if url in urlseen or parser.is_already_scraped(url):
                True
#                print "Skipping already scraped " + url
            else:
#                print "Will process " + url
                process_pdf(parser, url, errors)
            urlseen[url] = 1
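The long chain of hard-coded naroy.kommune.no URLs above marks days whose PDF only says there were no journal entries. A possible cleanup, keeping the list as a module-level set instead of a multi-line condition (the constant and helper below are assumptions, not part of the original scraper):

# Assumed module-level constant: PDFs that only state "no journal entries
# today" and therefore should never be parsed.
EMPTY_JOURNAL_URLS = set([
    "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf",
    # ... the remaining known-empty URLs from the example above ...
])

def is_empty_journal(url):
    # Hypothetical helper used as: if is_empty_journal(url): continue
    return url in EMPTY_JOURNAL_URLS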
Example #7
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.month-entry-title a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        print url
        if -1 != href.find("file://"):
#            print "Skipping non-http URL " + url
            continue
        subhtml = scraperwiki.scrape(url)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.related-attachements a"):
            subhref = subahref.attrib['href']
            suburl = urlparse.urljoin(url, subhref)
            if -1 == suburl.find(".pdf"):
                continue
            if parser.is_already_scraped(suburl):
                True
#                print "Skipping already scraped " + url
            else:
#                print "Will process " + url
                process_pdf(parser, suburl, errors)
def process_journal_pdfs(parser, listurl, errors):
    #    print "Finding PDFs on " + listurl
    #    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("table a"):
        href = ahref.attrib["href"]
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 == url.find("/postliste-"):
            #            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
        #            print "Skipping already scraped " + url
        else:
            #            print "Will process " + url
            process_pdf(parser, url, errors)
    # Follow the "next page" link to the end
    for ahref in root.cssselect("center a"):
        href = ahref.attrib["href"]
        url = urlparse.urljoin(listurl, href)
        print ahref.text, url
        if -1 != ahref.text.find("Neste side"):
            process_journal_pdfs(parser, url, errors)
def process_journal_pdf_directory(parser, listurl, errors):
    #html = scraperwiki.scrape(listurl)
    html = lazycache.lazycache(listurl)
    root = lxml.html.fromstring(html)
    html = None

    pdflisturls = []
    for ahref in root.cssselect("span.vrtx-paging-wrapper a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        pdflisturls.append(url)
#    print pdflisturls

    for listurl in pdflisturls:
        html = scraperwiki.scrape(listurl)
        root = lxml.html.fromstring(html)
        html = None
        urlseen = {}
        for ahref in root.cssselect("div.vrtx-resource a"):
            href = ahref.attrib['href']
            url = urlparse.urljoin(listurl, href)
            if -1 == url.find(".pdf"):
                continue
            # Ignore duplicates with M: as part of the name
            if -1 != url.find("/M%"):
                continue
            if url in urlseen or parser.is_already_scraped(url):
                True
#                print "Skipping already scraped " + url
            else:
                #                print "Will process " + url
                process_pdf(parser, url, errors)
            urlseen[url] = 1
def consider_url(parser, url, errors):
    if parser.is_already_scraped(url):
        True
#            print "Skipping already scraped " + url
    else:
#            print "Will process " + url
        try:
            process_pdf(parser, url, errors)
        except:
            pass
def process_pdf(parser, pdfurl, errors):
    if parser.is_already_scraped(pdfurl):
        return
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
        pdfcontent = None
    except ValueError, e:
        print e
        errors.append(e)
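The bare except in consider_url above swallows every failure, including SystemExit from the CPU-limit check. A hedged alternative, recording the error the way the process_pdf variant above does (a sketch, not the original code):

def consider_url(parser, url, errors):
    if parser.is_already_scraped(url):
        return
    try:
        process_pdf(parser, url, errors)
    except Exception, e:
        # Keep going, but record the failure instead of silently dropping it;
        # SystemExit and KeyboardInterrupt still propagate.
        errors.append(e)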
Example #12
def process_pdf(parser, pdfurl, errors):
    if parser.is_already_scraped(pdfurl):
        return
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
        pdfcontent = None
    except ValueError, e:
        print e
        errors.append(e)
def process_journal_pdfs(parser, listurl, errors):
    print "Finding PDFs on " + listurl
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div#placeholder-content td.name a"):
        href = ahref.attrib["href"]
        url = urlparse.urljoin(listurl, href).replace(" ", "+")
        if -1 != href.find("file://") or -1 == url.find(".pdf"):
            continue
        if parser.is_already_scraped(url):
            continue
        else:
            process_pdf(parser, url, errors)
Example #14
def process_journal_pdfs(parser, listurl, errors):
    print "Finding PDFs on " + listurl
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div#placeholder-content td.name a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "+")
        if -1 != href.find("file://") or -1 == url.find(".pdf"):
            continue
        if parser.is_already_scraped(url):
            continue
        else:
            process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors):
    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    xml = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(xml)
    xml = None
    for link in root.cssselect("hendelse link"):
        url = lxml.html.tostring(link).replace("<link>", "").strip()
        #print url
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)
Example #16
def process_journal_pdfs(parser, listurl, errors):
    print "Finding PDFs on " + listurl
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div#Hovedspalte div.tekst a.pdf"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "+")
        #if -1 != href.find("file://") or -1 == url.find(".pdf"): # we select pdf based on css class
        #    print "ignoring %s" % url
        #    continue
        if parser.is_already_scraped(url):
            continue
        else:
            #print url
            process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.main a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 == url.find("postliste-"):
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)
Example #19
def process_journal_pdfs(parser, listurl, errors):
    print "Finding PDFs on " + listurl
    #    u = urllib.parse.urlparse(listurl)
    xml = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(xml)
    xml = None
    for link in root.cssselect("hendelse link"):
        url = lxml.html.tostring(link).replace("<link>", "").strip()
        #print url
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
            #            print "Will process " + url
            process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("table a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "%20")
        if -1 != href.find("file://") or -1 == url.find(".pdf"):
#            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)
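Several of these scrapers escape URLs one character at a time with replace(" ", "%20") or replace(" ", "+"). A sketch using the standard library instead (helper name is an assumption; whether every host accepts a fully quoted path is not verified here):

import urllib
import urlparse

def quote_href(listurl, href):
    # Quote the path part of the joined URL instead of replacing individual
    # characters such as space and "å" by hand. Assumes the path is a byte
    # string; a unicode path would need .encode("utf-8") before quoting.
    absolute = urlparse.urljoin(listurl, href)
    parts = urlparse.urlsplit(absolute)
    path = urllib.quote(parts.path, safe="/%")
    return urlparse.urlunsplit((parts.scheme, parts.netloc, path,
                                parts.query, parts.fragment))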
Example #21
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("table a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 != href.find("mailto:"):
#            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors):
    #    print "Finding PDFs on " + listurl
    #    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.main a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 == url.find("postliste-"):
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
            #            print "Will process " + url
            process_pdf(parser, url, errors)
def find_journal_pdfs(parser, listurl):
#    print "Finding PDFs on " + listurl
    html = postlistelib.fetch_url_harder(listurl)

    root = lxml.html.fromstring(html)
    pdfurls = []
    for ahref in root.cssselect("div.mainbody a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://"):
#            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            pdfurls.append(url)
    return pdfurls
def process_pdf_links_cssselect(parser, listurl, errors, cssselect):
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect(cssselect + " a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "%20").replace(u"å", "%C3%A5")
        #print url
        if -1 != href.find("file://") or -1 != href.find("postliste/Documents/Brukerveiledning"):
#            print "Skipping non-http URL " + url
            continue
        if -1 == href.find(".pdf"):
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)
Example #25
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div#maincontent a"):
        if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"):
            continue
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        #print "found url %s" %url
        if -1 != href.find("file://"):
#            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)
Example #26
def find_journal_pdfs(parser, listurl):
    #    print "Finding PDFs on " + listurl
    html = postlistelib.fetch_url_harder(listurl)

    root = lxml.html.fromstring(html)
    pdfurls = []
    for ahref in root.cssselect("div.mainbody a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://"):
            #            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
            #            print "Will process " + url
            pdfurls.append(url)
    return pdfurls
def process_journal_pdfs(parser, listurl, errors):
    print "Finding PDFs on " + listurl
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.text_content table tr td a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "+")
        if (-1 == url.find("mnd.pdf")) and (-1 == url.find("dag.pdf")):
            continue

        # We get the ETag since the url does not change
        request = urllib2.Request(url)
        request.get_method = lambda : 'HEAD'
        response = urllib2.urlopen(request)
        #print response.info()
        etag = base64.b64encode(response.info().getheader("ETag"))
        url = "%s#%s" % (url, etag) # Yes, the etag is surrounded by ""
        if parser.is_already_scraped(url):
            continue
        else:
            print "Prosessing %s" % url
            process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors):
    #    print "Finding PDFs on " + listurl
    #    u = urllib.parse.urlparse(listurl)
    #http://www.ostfold-f.kommune.no/offjournal_show_index.asp?tmpSearch=1
    for week in range(1, 53):
        params = urllib.urlencode({
            'tmpSearch': str(week),
            'btnSubmit': 'Vis',
            'strHandling': 'uke'
        })

        headers = {
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/html"
        }
        conn = httplib.HTTPConnection("www.ostfold-f.kommune.no:80")
        conn.request("POST", "/offjournal_show_index.asp", params, headers)
        response = conn.getresponse()
        html = response.read()
        conn.close()
        root = lxml.html.fromstring(html)
        html = None
        for ahref in root.cssselect("table a"):
            href = ahref.attrib['href']
            url = urlparse.urljoin(listurl, href)
            if -1 != href.find("file://") or -1 == url.find(".pdf"):
                #            print "Skipping non-http URL " + url
                continue
            if parser.is_already_scraped(url):
                True
#            print "Skipping already scraped " + url
            else:
                #            print "Will process " + url
                process_pdf(parser, url, errors)
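The ETag trick in the mnd.pdf/dag.pdf scraper above pins a stable URL to a specific file revision, so is_already_scraped() notices when the file behind an unchanged URL is replaced. The same idea factored into a small helper (the function name is an assumption; the original inlines this in the loop):

import base64
import urllib2

def url_with_etag(url):
    # Issue a HEAD request and append the base64-encoded ETag as a fragment,
    # giving each revision of the file a distinct URL.
    request = urllib2.Request(url)
    request.get_method = lambda: 'HEAD'
    response = urllib2.urlopen(request)
    etag = base64.b64encode(response.info().getheader("ETag"))
    return "%s#%s" % (url, etag)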
Example #29
def process_pdf_links_cssselect(parser, listurl, errors, cssselect):
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect(cssselect + " a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "%20").replace(u"å", "%C3%A5")
        #print url
        if -1 != href.find("file://") or -1 != href.find("postliste/Documents/Brukerveiledning"):
            #            print "Skipping non-http URL " + url
            continue
        if -1 == href.find(".pdf"):
            continue
        if parser.is_already_scraped(url):
            True
#                print "Skipping already scraped " + url
        else:
            #            print "Will process " + url
            process_pdf(parser, url, errors)
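A typical call site for process_pdf_links_cssselect, sketched with placeholder values (the list URL and CSS selector are assumptions that depend on the site; parser is assumed to come from the scraper's own setup, and process_page_queue/report_errors are the helpers used in Example #1):

errors = []
# Hypothetical values: "div#content" is only a placeholder selector.
process_pdf_links_cssselect(parser, "http://www.example.kommune.no/postliste/",
                            errors, "div#content")
process_page_queue(parser, errors)
report_errors(errors)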