def test_small_pdfs(parser):
    # Test with some smaller PDFs
    errors = []
    #parser.debug = True
    newurl = "http://www.saltdal.kommune.no/images/module.files/010612.pdf"
    if not parser.is_already_scraped(newurl):
        process_pdf(parser, newurl, errors) # New format
    if parser.is_already_scraped(newurl):
        print "Already parsed"
    else:
        raise ValueError("Failed to parse")
#    process_pdf(parser, "http://www.saltdal.kommune.no/images/module.files/2007-01-31.pdf", errors) # Old format
    process_page_queue(parser, errors)
    report_errors(errors)
    exit(0)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) # http://www.ostfold-f.kommune.no/offjournal_show_index.asp?tmpSearch=1 for week in range(1, 53): params = urllib.urlencode({"tmpSearch": str(week), "btnSubmit": "Vis", "strHandling": "uke"}) headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/html"} conn = httplib.HTTPConnection("www.ostfold-f.kommune.no:80") conn.request("POST", "/offjournal_show_index.asp", params, headers) response = conn.getresponse() html = response.read() conn.close() root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("table a"): href = ahref.attrib["href"] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 == url.find(".pdf"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("table a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 == url.find("/postliste-"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors) # Follow the "next page" link to the end for ahref in root.cssselect("center a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) print ahref.text, url if -1 != ahref.text.find("Neste side"): process_journal_pdfs(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("table#hovedinnhold a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href).replace(" ", "+") if -1 != href.find("file://"): # print "Skipping non-http URL " + url continue if -1 == url.find(".pdf"): continue # Special case, file indicating no journal entries this day if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url or \ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/777B497BB48936ACC1257A450033E1D4/$FILE/Postjournal+20.07.12.pdf" == url or \ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/1802A0FF57C08EFEC1257A4500337345/$FILE/Postjournal+16.07.12.pdf" == url or \ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/90373A38701C27E5C1257A45002F63FD/$FILE/Postjournal+12.07.12.pdf" == url or \ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/6B00A3BD92B3C2AEC1257A45002F4044/$FILE/Postjournal+10.07.12.pdf" == url or \ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/0141B5488D38B8FEC1257A44003756ED/$FILE/Postjournal+06.07.12.pdf" == url: continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div#placeholder-content-main-left-column a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 == url.find('/postjournal/article'): # print "Skipping non-http URL " + url continue subhtml = scraperwiki.scrape(url) subroot = lxml.html.fromstring(subhtml) subhtml = None for subahref in subroot.cssselect("div.article-content a"): subhref = subahref.attrib['href'] suburl = urlparse.urljoin(listurl, subhref) if -1 == suburl.find(".pdf"): continue if parser.is_already_scraped(suburl): True # print "Skipping already scraped " + suburl else: # print "Will process " + suburl process_pdf(parser, suburl, errors)
def process_journal_pdf_directory(parser, listurl, errors):
    #html = scraperwiki.scrape(listurl)
    html = lazycache.lazycache(listurl)
    root = lxml.html.fromstring(html)
    html = None
    pdflisturls = []
    for ahref in root.cssselect("span.vrtx-paging-wrapper a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        pdflisturls.append(url)
#    print pdflisturls
    for listurl in pdflisturls:
        html = scraperwiki.scrape(listurl)
        root = lxml.html.fromstring(html)
        html = None
        urlseen = {}
        for ahref in root.cssselect("div.vrtx-resource a"):
            href = ahref.attrib['href']
            url = urlparse.urljoin(listurl, href)
            if -1 == url.find(".pdf"):
                continue
            # Ignore duplicates with M: as part of the name
            if -1 != url.find("/M%"):
                continue
            if url in urlseen or parser.is_already_scraped(url):
                True
#                print "Skipping already scraped " + url
            else:
#                print "Will process " + url
                process_pdf(parser, url, errors)
            urlseen[url] = 1
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div.month-entry-title a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) print url if -1 != href.find("file://"): # print "Skipping non-http URL " + url continue subhtml = scraperwiki.scrape(url) subroot = lxml.html.fromstring(subhtml) subhtml = None for subahref in subroot.cssselect("div.related-attachements a"): subhref = subahref.attrib['href'] suburl = urlparse.urljoin(url, subhref) if -1 == suburl.find(".pdf"): continue if parser.is_already_scraped(suburl): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, suburl, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("table a"): href = ahref.attrib["href"] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 == url.find("/postliste-"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors) # Follow the "next page" link to the end for ahref in root.cssselect("center a"): href = ahref.attrib["href"] url = urlparse.urljoin(listurl, href) print ahref.text, url if -1 != ahref.text.find("Neste side"): process_journal_pdfs(parser, url, errors)
def consider_url(parser, url, errors):
    if parser.is_already_scraped(url):
        True
#        print "Skipping already scraped " + url
    else:
#        print "Will process " + url
        try:
            process_pdf(parser, url, errors)
        except:
            pass
def process_pdf(parser, pdfurl, errors):
    if parser.is_already_scraped(pdfurl):
        return
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
        pdfcontent = None
    except ValueError, e:
        print e
        errors.append(e)
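# Illustrative only: a minimal driver sketch showing how the helpers in this
# collection are typically wired together in one of these ScraperWiki
# scrapers.  It is not taken from the original sources: the agency name and
# front-page URL are placeholders, and the PDFJournalParser constructor is an
# assumed postlistelib entry point.  process_page_queue() and report_errors()
# are the helpers referenced by test_small_pdfs() above.
def example_driver():
    agency = "Example kommune"                      # placeholder agency name
    frontpage = "http://www.example.no/postliste/"  # placeholder list URL
    errors = []
    parser = postlistelib.PDFJournalParser(agency=agency)  # assumed API
    process_journal_pdfs(parser, frontpage, errors)
    process_page_queue(parser, errors)
    report_errors(errors)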
def process_journal_pdfs(parser, listurl, errors): print "Finding PDFs on " + listurl html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div#placeholder-content td.name a"): href = ahref.attrib["href"] url = urlparse.urljoin(listurl, href).replace(" ", "+") if -1 != href.find("file://") or -1 == url.find(".pdf"): continue if parser.is_already_scraped(url): continue else: process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): print "Finding PDFs on " + listurl html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div#placeholder-content td.name a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href).replace(" ", "+") if -1 != href.find("file://") or -1 == url.find(".pdf"): continue if parser.is_already_scraped(url): continue else: process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) xml = scraperwiki.scrape(listurl) root = lxml.html.fromstring(xml) xml = None for link in root.cssselect("hendelse link"): url = lxml.html.tostring(link).replace("<link>", "").strip() #print url if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): print "Finding PDFs on " + listurl html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div#Hovedspalte div.tekst a.pdf"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href).replace(" ", "+") #if -1 != href.find("file://") or -1 == url.find(".pdf"): # we select pdf based on css class # print "ignoring %s" % url # continue if parser.is_already_scraped(url): continue else: #print url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div.main a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) if -1 == url.find("postliste-"): continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("table a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href).replace(" ", "%20") if -1 != href.find("file://") or -1 == url.find(".pdf"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("table a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 != href.find("mailto:"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def find_journal_pdfs(parser, listurl):
#    print "Finding PDFs on " + listurl
    html = postlistelib.fetch_url_harder(listurl)
    root = lxml.html.fromstring(html)
    pdfurls = []
    for ahref in root.cssselect("div.mainbody a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://"):
#            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            True
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            pdfurls.append(url)
    return pdfurls
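# Hedged usage sketch (not from the original scraper): unlike the other
# helpers, find_journal_pdfs() above only collects URLs, so a caller still has
# to hand each one to process_pdf().  The wrapper name is hypothetical.
def process_found_pdfs(parser, listurl, errors):
    for pdfurl in find_journal_pdfs(parser, listurl):
        process_pdf(parser, pdfurl, errors)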
def process_pdf_links_cssselect(parser, listurl, errors, cssselect): html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect(cssselect + " a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href).replace(" ", "%20").replace(u"å", "%C3%A5") #print url if -1 != href.find("file://") or -1 != href.find("postliste/Documents/Brukerveiledning"): # print "Skipping non-http URL " + url continue if -1 == href.find(".pdf"): continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div#maincontent a"): if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"): continue href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) #print "found url %s" %url if -1 != href.find("file://"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): print "Finding PDFs on " + listurl html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("div.text_content table tr td a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href).replace(" ", "+") if (-1 == url.find("mnd.pdf")) and (-1 == url.find("dag.pdf")): continue # We get the ETag since the url does not change request = urllib2.Request(url) request.get_method = lambda : 'HEAD' response = urllib2.urlopen(request) #print response.info() etag = base64.b64encode(response.info().getheader("ETag")) url = "%s#%s" % (url, etag) # Yes, the etag is surrounded by "" if parser.is_already_scraped(url): continue else: print "Prosessing %s" % url process_pdf(parser, url, errors)
def process_journal_pdfs(parser, listurl, errors): # print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) #http://www.ostfold-f.kommune.no/offjournal_show_index.asp?tmpSearch=1 for week in range(1, 53): params = urllib.urlencode({ 'tmpSearch': str(week), 'btnSubmit': 'Vis', 'strHandling': 'uke' }) headers = { "Content-type": "application/x-www-form-urlencoded", "Accept": "text/html" } conn = httplib.HTTPConnection("www.ostfold-f.kommune.no:80") conn.request("POST", "/offjournal_show_index.asp", params, headers) response = conn.getresponse() html = response.read() conn.close() root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("table a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 == url.find(".pdf"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)
def process_pdf_links_cssselect(parser, listurl, errors, cssselect): html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None for ahref in root.cssselect(cssselect + " a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href).replace(" ", "%20").replace(u"å", "%C3%A5") #print url if -1 != href.find("file://") or -1 != href.find( "postliste/Documents/Brukerveiledning"): # print "Skipping non-http URL " + url continue if -1 == href.find(".pdf"): continue if parser.is_already_scraped(url): True # print "Skipping already scraped " + url else: # print "Will process " + url process_pdf(parser, url, errors)