def rp_table():
    url = "http://www.atoc.org/clientfiles/File/routeing_points.pdf"
    print "Processing routeing point list"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    #print "The first 20000 characters are: ", xmldata[:20000]
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS rp_maps')
    scraperwiki.sqlite.execute('CREATE TABLE rp_maps (routeing_point, mapname)')
    print "Processing XML"
    # This is horrid, and assumes that the PDF will be in the correct order.
    rp = None  # set when a routeing-point name cell is seen
    for _, cell in lxml.etree.iterparse(StringIO.StringIO(xmldata), tag='text'):
        if int(cell.attrib['top']) > 100:
            if cell.attrib['left'] == '38':
                rp = cell.xpath('string()').title()
            else:
                for mapname in cell.xpath('string()').split():
                    scraperwiki.sqlite.execute('INSERT INTO rp_maps VALUES (?, ?)', (rp, mapname))
        cell.clear()
    print "Creating indexes"
    scraperwiki.sqlite.execute('CREATE INDEX maps_byrp ON rp_maps(routeing_point)')
    scraperwiki.sqlite.execute('CREATE INDEX rps_bymap ON rp_maps(mapname)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Routeing point list processed"
def do_it(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    extract the key question grades and inspection date as a dict.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    # just turn it into plain text
    raw = ''
    for index, page in enumerate(root):
        for text in page:
            raw += ' '.join(text.xpath("descendant-or-self::text()"))
            raw += "\n"
    # pull out the grades
    data = {}
    grades = extract_grades(raw)
    for i in range(1, 8):
        data['key_question_grade_%d' % (i,)] = grades[i]
    data['date_of_inspection'] = extract_date(raw)
    return data
def getpages(href):
    pdfdata = scraperwiki.scrape(href)
    xml = scraperwiki.pdftoxml(pdfdata)
    dom = lxml.etree.fromstring(xml)
    pages = list(dom)
    print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    return pages
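# Usage sketch (not from the original scrapers): the pages returned by
# getpages() are <page> elements whose <text> children carry the pdftoxml
# layout attributes (top, left) that the other functions in this file key on.
def dump_text_positions(href):
    for page in getpages(href):
        for el in page:
            if el.tag == "text":
                print page.attrib.get("number"), el.attrib.get("top"), el.attrib.get("left"), el.xpath("string()")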
def getpdfs():
    html = parse('http://www.safaricom.co.ke/index.php?id=275').getroot()
    html.make_links_absolute()
    pdf_urls = html.xpath('//table[@class="contenttable" and @width="540"]/descendant::a/@href')
    for url in pdf_urls:
        save(['date_scraped', 'url'],
             {"date_scraped": DATE, "url": url, "pdfxml": pdftoxml(urlopen(url).read())},
             'pdfs')
def pdfGrabber(typ):
    #if src == 'f1mediacentre': url = "http://www.fia.com/en-GB/mediacentre/f1_media/Documents/"+race+"-"+typ+".pdf"
    #http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-f1-2012-fp1-times.pdf
    #http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-f1-2012-fp1-classification.pdf
    ##trying http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-fp1-classification.pdf
    rnum = '08'
    typ2 = typ.replace('session', 'fp')
    if src == 'f1mediacentre':
        url = "http://184.106.145.74/fia-f1/f1-2012/f1-2012-" + rnum + "/" + race + "-f1-2012-" + typ2 + ".pdf"
    else:
        url = "http://dl.dropbox.com/u/1156404/" + race + "-" + typ + ".pdf"
        #url = 'http://dl.dropbox.com/u/1156404/mal-race-analysis.pdf'
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    '''
    print "After converting to xml it has %d bytes" % len(xmldata)
    print "The first 2000 characters are: ", xmldata[:2000]
    '''
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)
    #print 'pre', pages
    print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    return pages
def parseReport(pdfurl, urn):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    parse the report pages and save them to sqlite.'''
    try:
        pdfdata = urllib2.urlopen(pdfurl).read()
        if pdfdata == '':
            return "Failed to load/PDF does not exist"
        pdfxml = scraperwiki.pdftoxml(pdfdata)
        root = lxml.etree.fromstring(pdfxml)
        reportdata = []
        #print "URN %s URL %s" % (urn, pdfurl)
        # Save each page of the PDF.
        for index, page in enumerate(root):
            data = PageSave(page, index, urn)
            reportdata.append(data)
            for ldata in data:
                #print ldata
                lldata = ldata.copy()
                lldata["urm"] = urn
                scraperwiki.sqlite.save(unique_keys=ldata.keys(), data=lldata, table_name="other")
        #print reportdata
        report = {'urn': urn, 'data': reportdata}
        print report
        scraperwiki.sqlite.save(unique_keys=["urn"], data=report)
        return "Success"
    except Exception, e:
        return "Error %s" % e
def Main(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    output the contents as a styled HTML div.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    global styles
    fontspecs = {}
    # Get the PDF's internal styles: we'll use these to style the divs containing the PDF.
    for fontspec in root.xpath('page/fontspec'):
        id = fontspec.attrib.get('id')
        fontdesc = {'size': int(fontspec.attrib.get('size')),
                    'family': fontspec.attrib.get('family'),
                    'color': fontspec.attrib.get('color')}
        fontspecs[id] = fontdesc
        styles['div.fontspec-%s' % id] = 'color:%s;font-family:%s;font-size:%dpx' % (fontdesc['color'], fontdesc['family'], fontdesc['size'])
    # Output the view, with instructions for the user.
    print '<html dir="ltr" lang="en">'
    print '<head>'
    print ' <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    print ' <title>PDF to XML text positioning</title>'
    print ' <style type="text/css" media="screen">%s</style>' % "\n".join(["%s { %s }" % (k, v) for k, v in styles.items()])
    print ' <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>'
    print ' <script>%s</script>' % jscript
    print '</head>'
    # Print each page of the PDF.
    for index, page in enumerate(root):
        print Pageblock(page, index)
def preprocess(self, pdfurl, pdfcontent):
    print "Preprocessing PDF " + pdfurl
    if not pdfcontent:
        raise ValueError("No pdf content passed for " + pdfurl)
    if self.hiddentext:
        options = '-hidden'
    else:
        options = ''
    xml = scraperwiki.pdftoxml(pdfcontent, options)
    if self.debug:
        print xml
    pages = re.findall('(<page .+?</page>)', xml, flags=re.DOTALL)
    xml = None
    # print pages[:1][:1000]
    pagecount = 0
    datastore = []
    for page in pages:
        pagecount = pagecount + 1
        self.is_valid_page(pdfurl, pagecount, page)
        data = {
            'scrapedurl': pdfurl,
            'pagenum': pagecount,
            'pagecontent': page,
        }
        datastore.append(data)
    if 0 < len(datastore):
        scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable)
    else:
        raise ValueError("Unable to find any pages in " + pdfurl)
    pages = None
def scrapereport(reportlink):
    boldline = 0
    html = scraperwiki.scrape(baseurl + reportlink)
    root = lxml.html.fromstring(html)
    links = root.cssselect("div#unusefulbottom a")  #<div id="unusefulbottom">
    for link in links:
        print "LINK GRABBED WITH CSSSELECT", link
        print "link.attrib.get", link.attrib.get('href')
        downloadlink = link.attrib.get('href')
        # print "downloadlink[0].text_content()", downloadlink[0].text_content()
        pdfdata = urllib2.urlopen(baseurl + downloadlink).read()
        print "pdfdata", pdfdata
        xmldata = scraperwiki.pdftoxml(pdfdata)
        print "xmldata", xmldata
        pdfxml = lxml.etree.fromstring(xmldata)
        print "pdfxml", pdfxml
        boldtags = pdfxml.xpath('.//text')
        linenumber = 0
        for heading in boldtags:
            linenumber = linenumber + 1
            #print "Heading:", heading.text
            if heading.text is not None:
                # mention = re.match(r'.*NMS.*', heading.text)
                mention = re.match(r'.*overall.*', heading.text)
                if mention:
                    boldline = boldline + 1  # so each match gets its own uniqueref
                    print "FULL LINE", lxml.etree.tostring(heading, encoding="unicode", method="text")
                    # print "OVERALL", heading.text
                    # print "CHECK", pdfxml.xpath('.//text')[linenumber-1].text
                    # print "LINEAFTER", pdfxml.xpath('.//text')[linenumber].text
                    record = {}
                    record['overall'] = lxml.etree.tostring(heading, encoding="unicode", method="text")
                    record['uniqueref'] = reportlink + "_" + str(boldline)
                    record['downloadlink'] = baseurl + downloadlink
                    scraperwiki.sqlite.save(['uniqueref'], record)
def read_file_return_etree(uid):
    with open('cached_pdfs/{}.pdf'.format(uid), 'rb') as f:
        pdfdata = f.read()  # str
    xmldata = scraperwiki.pdftoxml(pdfdata)  # unicode
    xmldata = bytes(bytearray(xmldata, encoding='utf-8'))  # str
    element_tree = ET.fromstring(xmldata)
    return element_tree
def scrape_cieisp(year, text):
    if year == 2010:
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/INCIDENCIA_DELICTIVA_2010_030211.pdf"
    else:
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/CIEISP" + str(year) + ".pdf"
    a = scraperwiki.scrape(pdfurl)
    s = BeautifulSoup(scraperwiki.pdftoxml(a))
    dolosos_position = []
    i = 0
    for t in s.findAll('text'):
        if t.text == "DOLOSOS":
            if text == "POR ARMA DE FUEGO":
                dolosos_position.append(i + 14)
            else:
                dolosos_position.append(i)
        i += 1
    all_text = s.findAll('text')
    #print all_text
    if year <= 2008:
        if year >= 2006:
            states_names = states3
        else:
            states_names = states2
    else:
        states_names = states
    for i in range(0, 33):
        for j in range(1, 14):
            record = {'State': states_names[i],
                      'Year': year,
                      'Month': months[j - 1],
                      'Homicides': all_text[dolosos_position[i] + j].text,
                      'Crimetype': text}
            scraperwiki.datastore.save(["State", "Year", "Month"], record)
    return
def getTablePages():
    url = "http://www.bnr.rw/docs/publicnotices/List%20of%20MFIs%20Update_Sept_%202011.pdf"
    pdfdata = urlopen(url).read()
    xmldata = pdftoxml(pdfdata)
    root = fromstring(xmldata)
    pages = list(root)
    return pages
def pdfParser(url, path):
    txt = []
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    # 4. Have a peek at the XML (click the "more" link in the Console to preview it).
    #print lxml.etree.tostring(root, pretty_print=True)
    # 5. How many pages in the PDF document?
    pages = list(root)
    #print "There are", len(pages), "pages"
    # 6. Iterate through the elements in each page, and preview them
    for page in pages:
        for el in page:
            if el.tag == "text":
                #print el.text, el.attrib
                if el.text is not None:
                    txt.append(el.text)
    try:
        ftxt = open(path + '/text.txt', 'w')
        ftxt.write("\n".join(txt).encode('utf-8'))
        ftxt.close()
    except:
        pass
def urltohtml(url="http://www.madingley.org/uploaded/Hansard_08.07.2010.pdf"):
    import scraperwiki, urllib2, lxml.etree
    lazycache = scraperwiki.swimport('lazycache')
    pdfdata = lazycache.lazycache(url)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)

    # this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
    def gettext_with_bi_tags(el):
        res = []
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
        if el.tail:
            res.append(el.tail)
        return "".join(res)

    # collect the first hundred text elements from each page
    text = []
    for page in pages:
        for el in list(page)[:100]:
            if el.tag == "text":
                text.append(gettext_with_bi_tags(el))
    return '\n'.join(text)
def fetch_rows(url, x_threshold):
    points_rows = []
    districts_rows = []
    print url
    f = requests.get(url)
    pdf = scraperwiki.pdftoxml(f.content)
    root = lxml.etree.fromstring(pdf)
    pages = root.xpath("//page")
    for page in pages:
        page_number = int(page.xpath("./@number")[0])
        texts = page.xpath("./text")
        tag = 0
        for text in texts:
            x = int(text.xpath("./@left")[0])
            y = int(text.xpath("./@top")[0])
            if text.text is not None:
                s = re.sub(r'\d+\.', '', text.text).strip()
                m = re.match(r'\d+\.', text.text)
                if m is not None:
                    tag = int(m.group(0).strip()[0:-1])
                if len(s) == 0:
                    continue
                d = {'text': s, 'y': y, 'x': x, 'page': page_number, 'tag': tag}
                if x >= x_threshold:
                    points_rows.append(d)
                else:
                    districts_rows.append(d)
    return (points_rows, districts_rows)
def parse_pdf(url):
    pdf_data = urllib2.urlopen(url).read()
    assert len(pdf_data) > 0
    xml_data = sw.pdftoxml(pdf_data)
    tree = etree.parse(StringIO(xml_data))
    root = tree.getroot()
    print root.xpath('//text[@left < 200]')
def GetPDFtrans():
    pdfurl = "http://www.birmingham.gov.uk/cs/Satellite?%26ssbinary=true&blobcol=urldata&blobheader=application%2Fpdf&blobheadername1=Content-Disposition&blobkey=id&blobtable=MungoBlobs&blobwhere=1223439077563&blobheadervalue1=attachment%3B+filename%3D444523Payments+over+%C2%A3500+August.pdf"
    c = urllib.urlopen(pdfurl).read()
    x = scraperwiki.pdftoxml(c)
    print x[:4000]
    urlup = "http://seagrass.goatchurch.org.uk/~julian/cgi-bin/uu.cgi"
    d = urllib.urlencode({"name": "brumpdf500xml", "contents": x})
    print urllib.urlopen(urlup, d).read()
def scrapeschool(url):
    print url
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    #create an empty variable 'record', which is a dictionary
    record = {}
    #create a uniqueid that we'll add to with each record later
    uniqueid = 0
    record["school"] = root.cssselect("h1")[0].text_content()
    record["parentviewurl"] = root.xpath(".//div[@id='content']//a")[0].attrib.get('href')
    #Expressed more simply, this could take up three lines like so:
    # parentviewurls = root.xpath(".//div[@id='content']//a")
    # parentviewurl = parentviewurls[0].attrib.get('href')
    # record["parentviewurl"] = parentviewurl
    record["URN"] = root.xpath(".//div[@id='content']//p//strong")[0].text_content()
    record["Address"] = lxml.etree.tostring(root.xpath(".//div[@id='content']//p")[1])
    report1url = root.xpath(".//table[@summary='Previous reports']//td//a")[0].attrib.get('href')
    record["report1url"] = report1url
    # record["inspectiondate"] = root.xpath(".//table[@summary='Previous reports']//td")[1].text_content
    uniqueid += 1
    record["uniqueid"] = uniqueid
    print record
    #use the urllib2 library's .urlopen function to open the full PDF URL, and the .read() function to read it into a new object, 'pdfdata'
    pdfdata = urllib2.urlopen(baseurl + report1url).read()
    #use pdftoxml to convert that into an xml document
    pdfread = scraperwiki.pdftoxml(pdfdata)
    print pdfread
    #use lxml.etree to convert that into an lxml object
    pdfroot = lxml.etree.fromstring(pdfread)
    leadership = re.search(r'b>The quality of .* <b', pdfread)
    if leadership:
        # print linenumber
        print leadership.group()
    #find all <b> tagged lines - headings?
    lines = pdfroot.findall('.//text')
    linenumber = 0
    for line in lines:
        linenumber = linenumber + 1
        if line.text:
            FSM = re.match(r'.* free school meals .*', line.text)
            if FSM:
                print linenumber
                print FSM.group()
                # if pdfroot.xpath('.//text')[linenumber-2].text:
                print pdfroot.xpath('.//text')[linenumber - 2].text
                print pdfroot.xpath('.//text')[linenumber - 1].text
                print pdfroot.xpath('.//text')[linenumber].text
                # if pdfroot.findall('.//text')[linenumber].text:
                record["FSM3"] = pdfroot.findall('.//text')[linenumber].text
                print record
    #UP TO HERE. NEED TO:
    #IDENTIFY THE LINE WE WANT - PERHAPS .XPATH AND (CONTAINS)
    #GRAB X CHARACTERS AFTER THAT - OR:
    #IDENTIFY THE INDEX POSITION OF THAT <TEXT><B> HEADING AND THE NEXT ONE AND GRAB ALL LINES BETWEEN
    scraperwiki.sqlite.save(["uniqueid"], record)
def get_id_period(self, date):
    from_iso_dt, to_iso_dt = util.inc_dt(date.strftime(util.ISO8601_DATE), util.ISO8601_DATE, self.PERIOD_TYPE)
    from_dt = util.get_dt(from_iso_dt, util.ISO8601_DATE)
    to_dt = util.get_dt(to_iso_dt, util.ISO8601_DATE)
    url_date = to_dt.strftime(self.search_url1)
    if self.DEBUG: print url_date
    try:
        response = self.br.open(url_date)
    except:
        url_date = to_dt.strftime(self.search_url2)
        if self.DEBUG: print url_date
        try:
            response = self.br.open(url_date)
        except:
            url_date = to_dt.strftime(self.search_url3)
            if self.DEBUG: print url_date
            try:
                response = self.br.open(url_date)
            except:
                response = None
    final_result = []
    if response:
        html = response.read()
        if self.DEBUG: print html
        url = response.geturl()
        result = scrapemark.scrape(self.scrape_ids1, html, url)
        if not result or not result.get('records'):
            result = scrapemark.scrape(self.scrape_ids1a, html, url)
        if not result or not result.get('records'):
            result = scrapemark.scrape(self.scrape_ids2, html, url)
        if not result or not result.get('records'):
            pdfxml = scraperwiki.pdftoxml(html)
            if self.DEBUG: print pdfxml
            result = scrapemark.scrape(self.scrape_ids3, pdfxml, url)
        if result and result.get('records'):
            for rec in result['records']:
                rec['url'] = url_date
                rec['date_received'] = to_iso_dt
                rec['start_date'] = to_iso_dt
                if rec.get('agent1'):
                    if rec.get('agent2'):
                        rec['agent_address'] = rec['agent1'] + ' ' + rec['agent2']
                    else:
                        rec['agent_address'] = rec['agent1']
                    del rec['agent1']
                if 'agent2' in rec:
                    del rec['agent2']
            self.clean_ids(result['records'])
            for rec in result['records']:  # note do this after record cleaning
                rec['date_scraped'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            final_result.extend(result['records'])
        #else:
        #    return [], None, None
    return final_result, from_dt, to_dt
def __init__(self, filename):
    # load the pdf
    with open(filename, 'rb') as f:
        pdf_string = f.read()
    # convert to xml
    xml_string = scraperwiki.pdftoxml(pdf_string)
    # parse xml
    self._xml = lxml.etree.fromstring(xml_string)
    self._pages = [self._page_to_blocks(page_num)
                   for page_num in range(1, self.page_count() + 1)]
def ConvertPDFtoSqlite(docname, pdfurl):
    print "converting", docname, pdfurl
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    try:
        root = lxml.etree.fromstring(pdfxml)
    except lxml.etree.XMLSyntaxError, e:
        print "Bad xml file", str(e)
        print pdfxml[:19000]
        return
def Main(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    output the contents as a styled HTML div.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    global styles
    fontspecs = {}
    # Get the PDF's internal styles: we'll use these to style the divs containing the PDF.
    for fontspec in root.xpath('page/fontspec'):
        id = fontspec.attrib.get('id')
        fontdesc = {'size': int(fontspec.attrib.get('size')),
                    'family': fontspec.attrib.get('family'),
                    'color': fontspec.attrib.get('color')}
        fontspecs[id] = fontdesc
        styles['div.fontspec-%s' % id] = 'color:%s;font-family:%s;font-size:%dpx' % (fontdesc['color'], fontdesc['family'], fontdesc['size'])
    # Output the view, with instructions for the user.
    print '<html dir="ltr" lang="en">'
    print '<head>'
    print ' <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    print ' <title>PDF to XML text positioning</title>'
    print ' <style type="text/css" media="screen">%s</style>' % "\n".join(["%s { %s }" % (k, v) for k, v in styles.items()])
    print ' <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>'
    print ' <script>%s</script>' % jscript
    print '</head>'
    print '<div class="info" id="info1">&lt;text block&gt;</div>'
    print '<div class="info" id="info2">&lt;position&gt;</div>'
    print '<div class="heading">'
    print '<h2>Graphical preview of scraperwiki.pdftoxml(pdfdata)</h2>'
    print '<p>Click on a text line to see its coordinates and any other text that shares the same column or row.'
    print ' Useful for discovering what coordinates to use when extracting rows from tables in a document.</p>'
    print '<p>To do: track the coordinates of the mouse and cross reference with <a href="/cropper">cropper</a> technology.</p>'
    print '<p class="href"><a href="%s">%s</a></p>' % (pdfurl, pdfurl)
    print '<form id="newpdfdoclink">'
    print ' Another PDF link:'
    print ' <input type="text" name="url" value="" title="paste in url of new document">'
    print ' <input type="submit" value="Go">'
    print '</form>'
    ttx = re.sub('<', '&lt;', pdfxml)
    ttx = re.sub('\n', '\r\n', ttx)
    print '<textarea class="pdfprev">%s</textarea>' % ttx[:5000]
    print '</div>'
    print '<p>There are %d pages</p>' % len(root)
    # Print each page of the PDF.
    for index, page in enumerate(root):
        print Pageblock(page, index)
def parsepdf(pdfurl):
    a = scraperwiki.scrape(pdfurl)
    s = BeautifulSoup(scraperwiki.pdftoxml(a))
    kobcine = {}
    for t in s.findAll('text'):
        if t.text != " ":
            ko_ime = find_ko(t.text)
            if ko_ime:
                ko = kobcine.setdefault(ko_ime, 0)
                kobcine[ko_ime] = ko + 1
    return kobcine
def paginating(url):
    pages = range(13, 276, 2)
    pdf_data = urllib2.urlopen(url).read()
    xml_data = sw.pdftoxml(pdf_data)
    html_data = html.fromstring(xml_data)
    for page in pages:
        page_data = html_data.cssselect('page')[page]
        print html.tostring(page_data)
        parse_pdf_header(page_data)
def pink_pages():
    #
    # Pink pages
    #
    print "Loading pink pages (station to RP mapping)"
    url = "http://www.atoc.org/clientfiles/File/routeing_point_identifier.pdf"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    # print "After converting to xml it has %d bytes" % len(xmldata)
    # print "The first 20000 characters are: ", xmldata[:20000]
    print "Parsing XML"
    root = lxml.etree.fromstring(xmldata)
    # Each station is on a single line consisting of the station name and then
    # the various routeing points.
    stncells = root.xpath('//text[@left=37]')
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS routeing_points')
    scraperwiki.sqlite.execute('CREATE TABLE routeing_points (station, routeing_point)')
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS groups')
    scraperwiki.sqlite.execute('CREATE TABLE groups (station, stngroup)')
    find_other_cells = lxml.etree.XPath('following-sibling::text[@top=$this/@top]')
    print "Extracting station list"
    for stncell in stncells:
        # Find other cells on the same row of the same page.
        othercells = find_other_cells(stncell, this=stncell)
        for othercell in othercells:
            stn, other = stncell.xpath('string()'), othercell.xpath('string()')
            stn = stn.title()
            if other == "Routeing Point":
                other = stn
            if other.endswith(" Routeing Point Member"):
                other = other[:-22]
                scraperwiki.sqlite.execute('INSERT INTO groups VALUES (?, ?)', (str(stn), str(other)), verbose=0)
            scraperwiki.sqlite.execute('INSERT INTO routeing_points VALUES (?, ?)', (str(stn), str(other)), verbose=0)
    print "Creating indexes"
    scraperwiki.sqlite.execute('CREATE INDEX points_bystn ON routeing_points(station)')
    scraperwiki.sqlite.execute('CREATE INDEX groups_bystn ON groups(station)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Pink pages processed"
def main():
    link_src = '''https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=malaysian_parliament_hansard_url&query=select%20*%20from%20swdata%20limit%2010'''
    links = urllib2.urlopen(link_src)
    links_data = json.load(links)
    pdf_url = links_data[2]['url'].replace(' ', '%20')
    print pdf_url
    pdf_data = urllib2.urlopen(pdf_url).read()
    xml_data = scraperwiki.pdftoxml(pdf_data)
    xml_data = xml_data.replace('<b>', '').replace('</b>', '')
    print xml_data
    root = etree.fromstring(xml_data)
def parse_pdf(url, name, page_url):
    url = url.encode('ascii')
    name = name.encode('utf-8')
    print name
    pdf_url = "http://www.has-sante.fr/portail/" + url
    avis = " "
    avis2 = " "
    #follows the first link
    a = scraperwiki.scrape(pdf_url)
    a = a.lower()
    #finds the actual link (there's a redirect)
    soup = BeautifulSoup(a)
    pdf_url = soup.find("meta")
    pdf_url = pdf_url['content']
    pdf_url = pdf_url.replace("0; url='../../../../", "http://www.has-sante.fr/portail/")
    pdf_url = pdf_url[:-1]
    pdf_url = pdf_url.encode('ascii')
    #now for the real pdf
    try:
        b = scraperwiki.scrape(pdf_url)
        s = BeautifulSoup(scraperwiki.pdftoxml(b))
        #some basic regex to extract meaningful info
        for t in s.findAll('text'):
            if t.text != " ":
                pattern = '^.*?int.r.t de sant. publique.*?faible.*?$'
                pattern2 = '^.*?service m.dical rendu par.*?$'
                if re.search(pattern, t.text):
                    avis = t.text
                    avis = avis.encode('utf-8')
                    print avis
                elif re.search(pattern2, t.text):
                    avis2 = t.text
                    avis2 = avis2.encode('utf-8')
                    print avis2
        #now we've got everything, we're adding it to the DB
        data = {}
        medoc_name = name
        data['Name'] = medoc_name
        data['pdf_url'] = pdf_url
        data['page_url'] = page_url
        data['interet_sante'] = avis + "\n" + avis2
        data[medoc_name] = medoc_name
        scraperwiki.datastore.save(['Name'], data)
    except:
        print "Error " + pdf_url
def process_pdf(pdfurl):
    # (harder example to work on: http://www.nihe.gov.uk/schemes_accepted_010109_to_310309.pdf )
    pdfdata = urllib.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    s = BeautifulSoup(pdfxml)
    entrylines = []
    # column values for the entry currently being assembled
    periode = innut = m = y = kapittel = None
    post = overfraifjor = bevilgning = samlbevilgning = regnskap = rest = None
    for text in s.findAll('text'):
        #print text
        entrylines.append(text)
        left = int(text['left'])
        if 82 == left:
            periode = text.text
            innut, m, y = periode.split(" ")
        if 107 == left:
            kapittel = text.text
        if 163 == left:
            post = text.text
        if 704 <= left and left <= 782:
            overfraifjor = text.text
        if 822 <= left and left <= 867:
            bevilgning = text.text
        if 920 <= left and left <= 965:
            samlbevilgning = text.text
        if 1011 <= left and left <= 1056:
            regnskap = text.text
        if 1124 <= left and left <= 1156 and u"1000 kr" != text.text.strip() and post is not None:
            rest = text.text
            if overfraifjor is None or bevilgning is None or samlbevilgning is None or regnskap is None or rest is None:
                error(entrylines, kapittel, post, overfraifjor, bevilgning, samlbevilgning, regnskap, rest)
            data = {
                'periode': periode,
                'year': y,
                'month': m,
                'type': innut,
                'kapittel': kapittel,
                'post': post,
                'overfraifjor': valstr2int(overfraifjor),
                'bevilgning': valstr2int(bevilgning),
                'samlbevilgning': valstr2int(samlbevilgning),
                'regnskap': valstr2int(regnskap),
                'rest': valstr2int(rest),
            }
            #print data
            #time.sleep(1)
            scraperwiki.sqlite.save(unique_keys=['periode', 'kapittel', 'post'], data=data)
            post = None
            overfraifjor = None
            bevilgning = None
            samlbevilgning = None
            regnskap = None
            rest = None
            entrylines = []
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    find_link = soup.findAll(href=re.compile("/downloads/pdf/crime_statistics/"))
    next_link = [None] * len(find_link)
    rep_link = [None] * len(find_link)
    for i in range(len(find_link)):
        next_link[i] = find_link[i]['href']
        rep_link[i] = next_link[i].replace('../..', 'http://www.nyc.gov/html/nypd')
    for i in range(len(rep_link)):
        a = scraperwiki.scrape(rep_link[i])
        #here I call my previously defined function to convert and scrape the pdf
        soup_pdf = BeautifulSoup(scraperwiki.pdftoxml(a))
        scrape_table(soup_pdf)
def extract_birth(cvfile):
    '''Extract birth year from PDF file with CV.'''
    pdf = open(cvfile, 'rb')
    xml = scraperwiki.pdftoxml(pdf.read())
    root = lxml.etree.fromstring(xml)
    birthstr = root.xpath('//text[@top="320"]')[0].text
    mf = 'M' if birthstr[3] == 'o' else 'F'
    birthyear = birthstr[-4:]
    print(mf, birthyear)
def pdf_scrape(pdf, directory):
    '''Convert pdf to xml'''
    with open("pdf/" + directory + "/" + pdf, "rb") as u:
        xml = pdftoxml(u.read())
    if not os.path.exists("xml"):
        os.mkdir("xml")
    with open("xml/" + pdf + ".xml", "w") as w:
        w.write(xml)
    return xml
def main():
    url = "http://governor.ny.gov/citizenconnects/assets/document/CitizenConnectsdoc.pdf"
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    rootdata = lxml.etree.fromstring(xmldata)
    pages = list(rootdata)
    # print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    for page in pages:
        entries = getText(page)
        store(entries)
def Main(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    output the contents as a styled HTML div.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    try:
        root = lxml.etree.fromstring(pdfxml)
    except lxml.etree.XMLSyntaxError, e:
        print str(e), str(type(e)).replace("<", "&lt;")
        print pdfurl
        print pdfxml.replace("<", "&lt;")
        root = []
def scrape_pdf(url):
    '''Scrape data from PDF at URL.'''
    try:
        pdf_data = urlopen(url).read()
    except:
        return "#MISSING!"
    pdf_xml = scraperwiki.pdftoxml(pdf_data)
    root = lxml.etree.fromstring(pdf_xml)
    #pages = list(root)
    full_text = get_pdf_text(root)
    return full_text
def process_pdf(pdfurl):
    pdfxml = u.findInCache(pdfurl, verbose=True)  # look for html parse in cache
    if pdfxml is None:  # a html parse is not cached
        pdfdata = lazycache.lazycache(pdfurl, verbose=True)  # look for pdf document in cache, if not download
        pdfxml = scraperwiki.pdftoxml(pdfdata, "-hidden")  # parse pdf text to html
        u.putInCache(pdfurl, pdfxml, verbose=True)  # save cache of html parse
    beautifulxml = BeautifulSoup(pdfxml)  # convert html to BeautifulSoup(4) object
    for page in beautifulxml.find_all('page'):
        FIRSTPAGE = 6
        LASTPAGE = 6
        if int(page['number']) < FIRSTPAGE:
            continue
        if int(page['number']) == FIRSTPAGE:
            print "*******************************************"
            print "***** FIRSTPAGE #%d while developing ******" % (FIRSTPAGE)
            print "*******************************************"
        if int(page['number']) == LASTPAGE + 1:
            print "*******************************************"
            print "****** LASTPAGE #%d while developing ******" % (LASTPAGE)
            print "*******************************************"
            break
        print("*******************************************")
        print("********** Working on page #%s **********" % page['number'])
        print("*******************************************")
        elementList = deque(page.find_all('text'))  # we want to be able to use popleft
        d(elementList)
        while True:
            try:
                currElement = elementList.popleft()
                if "Innhold:" in currElement.text and currElement.b:
                    # we found a "Innhold:"-header
                    entry = parseDocumentRecord(currElement, elementList)
                    print entry
                    scraperwiki.sqlite.save(unique_keys=["innhold", "sakstittel"], data=entry)
                    d("back in process_pdf")
                #else:
                #    print currElement.text
            except IndexError, e:
                d("No more text elements on page (%s)" % e)
                break
def Main(url):
    tmpfile = tempfile.gettempdir() + "/45_networkrail.zip"
    tmpdir = tempfile.gettempdir() + "/45_networkrail"  #+str(random.randint(2, 1000000000))
    urllib.urlretrieve(url, tmpfile)
    with zipfile.ZipFile(tmpfile, 'r') as myzip:
        myzip.extractall(tmpdir)
    f = open(tmpdir + "/completeTimetable.pdf", 'rb')
    pdfxml = scraperwiki.pdftoxml(f.read())
    #print(os.listdir(tmpdir))
    #print(pdfxml)
    root = lxml.etree.fromstring(pdfxml)
    print '<p>There are %d pages</p>' % len(root)
    print lxml.etree.tostring(root[0])
def scrapepdf(url):
    #use the urllib2 library's .urlopen function to open the full PDF URL, and the .read() function to read it into a new object, 'pdfdata'
    pdfdata = urllib2.urlopen(url).read()
    #use pdftoxml to convert that into an xml document
    pdfread = scraperwiki.pdftoxml(pdfdata)
    print pdfread
    #use lxml.etree to convert that into an lxml object
    pdfroot = lxml.etree.fromstring(pdfread)
    #find all <text> tags and put in list variable 'lines'
    lines = pdfroot.findall('.//text')
    #create variable 'linenumber', initialised at 0
    linenumber = 0
    record = {}
    #loop through each item in 'lines' list
    for line in lines:
        #add one to 'linenumber' so we can track which line we're dealing with
        linenumber = linenumber + 1
        #if 'line' has some text:
        if line.text is not None:
            #create a new variable 'mention' that is filled with the result of
            #using the 're' library's .search function
            mention = re.search(r'.*black.*', line.text)
            if mention:
                print line.text
                #the RANGE function generates a list from the first parameter to the second,
                #e.g. range(5,8) would make [5, 6, 7] - it doesn't include the 'end' of the range
                #in this case we're using the line number minus 2, and the linenumber as our start and end points
                print range(linenumber - 2, linenumber + 1)
                linebefore = "EMPTY LINE"
                lineafter = "EMPTY LINE"
                incontextlist = []
                if pdfroot.xpath('.//text')[linenumber - 2].text:
                    linebefore = pdfroot.xpath('.//text')[linenumber - 2].text
                incontextlist.append(linebefore)
                incontextlist.append(pdfroot.xpath('.//text')[linenumber - 1].text)
                if pdfroot.xpath('.//text')[linenumber].text is not None:
                    lineafter = pdfroot.xpath('.//text')[linenumber].text
                incontextlist.append(lineafter)
                print "mention.group()", mention.group()
                print "CAN YOU SEE ME?", ''.join(incontextlist)
                record["mention in context"] = ''.join(incontextlist)
                record["linenumber"] = linenumber
                #store the 'url' variable that was passed in at the start of this function
                record["url"] = url
                print record
                scraperwiki.sqlite.save(["linenumber", "url"], record)
def ExtractPdf(year, nz, pdfbin, lurl):
    mnz = re.match("(...).*?(?:\d\d)?(\d\d)?_3.pdf", nz)
    assert mnz, nz
    assert mnz.group(1).lower() in m3, nz
    dnz = "%d-%02d" % (mnz.group(2) and int(mnz.group(2)) + 2000 or int(year),
                       m3.index(mnz.group(1).lower()) + 1)
    #print "date", dnz
    root = lxml.etree.fromstring(scraperwiki.pdftoxml(pdfbin))
    currentcountry = None
    currentmission = None
    ldata = []
    data = None
    for page in list(root):
        rtblocks = []
        #print lxml.etree.tostring(page)
        for text in page:
            if text.tag != "text":
                continue
            if 130 <= int(text.attrib.get("left")) <= 140:
                #print lxml.etree.tostring(text)
                currentmission = None
                currentcountry = text_content(text).strip()
            if 276 <= int(text.attrib.get("left")) <= 280:
                if rtblocks and data:
                    lndata = parsemissionblock(rtblocks, data)
                    ldata.extend(lndata)
                currentmission = text_content(text).strip()
                data = {"link": lurl, "nz": nz, "month": dnz,
                        "country": currentcountry, "mission": currentmission,
                        "year": year}
                rtblocks = []
            if int(text.attrib.get("left")) > 350:
                rtblocks.append(text)
        if rtblocks and data:
            lndata = parsemissionblock(rtblocks, data)
            ldata.extend(lndata)
    scraperwiki.sqlite.save(["month", "country", "mission", "desc"], ldata)
    return dnz, len(ldata)
def scrape_pdf(url):
    '''Scrape data from PDF at URL.'''
    try:
        pdf_data = urlopen(url).read()
    except:
        return (None, None, None)
    pdf_xml = scraperwiki.pdftoxml(pdf_data)
    root = lxml.etree.fromstring(pdf_xml)
    page0 = root.find('page')
    try:
        content = dict(parse(list(tokenize(page0))))
    except ParseError:
        content = None
    full_text = get_pdf_text(root)
    return pdf_xml, full_text, content
def scrapepdf(pdfurl):
    #print "scraping " + pdfurl
    pdfdata = urllib.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    s = BeautifulSoup(pdfxml)
    #print s
    casenr = None
    datestr = None
    daynr = None
    last_line = ""
    for idx, text in enumerate(s.findAll('text')):
        msg = text.text
        #print msg
        if 0 == msg.find(u"Møte "):
            datestr = datestr2date(msg.split("den ")[1].split(" kl.")[0])
            #print datestr
        if 0 == msg.find("D a g s o r d e n (nr."):
            daynr = msg.split(")")[0].split(".")[1]
            continue
        if -1 != msg.find("Votering i sak nr."):
            #print msg
            casenr = msg.split("nr.")[1].strip()
            continue
        elif -1 != msg.find("Votering i sak "):
            #print msg
            casenr = msg.split("i sak ")[1].strip()
            continue
        if (-1 != msg.find("enstemmig bifalt") or -1 != msg.find("enstemmig vedtatt")
                or ((-1 != msg.find("bifalt") or -1 != msg.find("vedtatt"))
                    and -1 != last_line.find("ble enstemmig"))):
            #print datestr, daynr, casenr, msg
            data = {
                'index': idx,
                'date': datestr,
                'daynr': daynr,
                'casenum': casenr,
                'msg': last_line + msg,
            }
            if casenr is not None:
                scraperwiki.sqlite.save(unique_keys=['date', 'casenum', 'index'], data=data)
        last_line = msg
def maps():
    url = "http://www.atoc.org/clientfiles/File/Maps.pdf"
    print "Fetching maps"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    #print "The first 20000 characters are: ", xmldata[:20000]
    print "Converting PDF to PNGs"
    with tempfile.NamedTemporaryFile() as pdffile:
        pdffile.write(pdfdata)
        pdffile.flush()
        tmpdir = tempfile.mkdtemp()
        subprocess.check_call(['pdftoppm', '-r', '75', '-png', pdffile.name, os.path.join(tmpdir, 'p')])
    print "Parsing XML"
    root = lxml.etree.fromstring(xmldata)
    print "Processing maps"
    maptitles = root.xpath('//text[@height=100]')
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS maps')
    scraperwiki.sqlite.execute('CREATE TABLE maps (mapname, pageno, data)')
    for maptitle in maptitles:
        pageno = int(maptitle.xpath('string(../@number)'))
        with open(os.path.join(tmpdir, 'p-%03d.png' % pageno), 'rb') as f:
            scraperwiki.sqlite.execute('INSERT INTO maps VALUES (?,?,?)',
                                       (maptitle.xpath('string()'), pageno, base64.b64encode(f.read())))
    print "Creating index"
    scraperwiki.sqlite.execute('CREATE INDEX maps_bymap ON maps(mapname, pageno)')
    print "Committing maps"
    scraperwiki.sqlite.commit()
    print "Maps processed"
def yellow_pages():
    #
    # Yellow pages
    #
    # This file is huge, so we do the XML parsing incrementally.
    #
    print "Loading yellow pages (permitted route list)"
    url = "http://www.atoc.org/clientfiles/File/permitted_route_identifier.pdf"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    # print "After converting to xml it has %d bytes" % len(xmldata)
    # print "The first 20000 characters are: ", xmldata[:20000]
    orig = None
    dest = None
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS permitted_routes')
    scraperwiki.sqlite.execute('CREATE TABLE permitted_routes (orig, dest, maps)')
    print "Processing XML"
    # This is horrid, and assumes that the PDF will be in the correct order.
    for _, cell in lxml.etree.iterparse(StringIO.StringIO(xmldata), tag='text'):
        if cell.attrib['height'] == '10':
            if cell.attrib['left'] == '80':
                orig = cell.xpath('string()')
            elif cell.attrib['left'] == '208':
                dest = cell.xpath('string()')
            else:
                scraperwiki.sqlite.execute('INSERT INTO permitted_routes VALUES (?, ?, ?)',
                                           (orig, dest, cell.xpath('string()')))
        cell.clear()
    print "Creating index"
    scraperwiki.sqlite.execute('CREATE INDEX routes_bystn ON permitted_routes(orig, dest)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Yellow pages done"
def process_pdf(url):
    print "PROCESSING: ", url,
    pdfdata = urllib2.urlopen(url).read()
    print len(pdfdata), "bytes"
    if len(pdfdata) > 50000:
        return ""  #too BIG Daddio!
    result = ''  # accumulates the extracted words
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)

    def gettext_with_bi_tags(el):
        res = []
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
        if el.tail:
            res.append(el.tail)
        return "".join(res)

    for page in pages:
        print page.attrib.get("number")
    # collect the first thousand text elements from the first page
    page0 = pages[0]
    i = []
    data = []
    for el in list(page0)[:1000]:
        if el.tag == "text":
            data = {}
            text = strip_tags(gettext_with_bi_tags(el))
            #data['text'] = text
            #data['url'] = url  # The source of these words
            if text != '' and text != ' ':
                #scraperwiki.sqlite.save(i, data)
                result += " " + text
    return result
def getheadingsfrompdf(pdfurl):
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    ldata = []
    for page in root:
        for el in page:
            # needs also to do concatenation between headings that run to two lines,
            # and handle headings with italics in them <i>
            if el.tag == "text" and el.attrib.get("font") == "10" and len(el) == 1 and el[0].tag == "b":
                data = {"pdfurl": pdfurl,
                        "pagenumber": int(page.attrib.get("number")),
                        "heading": el[0].text}
                ldata.append(data)
    scraperwiki.sqlite.save(["pdfurl", "pagenumber", "heading"], ldata, "subheadings")
def scrape():
    u = open("cho-1-elementary.pdf", "rb")
    x = scraperwiki.pdftoxml(u.read())
    soup = BeautifulSoup(x)
    book = soup.get_text().split('\n')
    page = []
    newpage = []
    for x in book[36:]:
        newpage += [x]
        if x == '':
            pass
        elif x[0] == 'p':
            page += [newpage]
            newpage = []
        elif x[0] == '<':
            newpage = [x]
    return page
def iter_areas():
    import scraperwiki
    import StringIO
    pdfurl = "http://www.appc.org.uk/appc/filemanager/root/site_assets/pdfs/appc_register_entry_for_1_december_2009_to_28_february_2010.pdf"
    pdf = scraperwiki.scrape(pdfurl)
    print "Converting pdf to xml"
    xml = scraperwiki.pdftoxml(pdf)
    print "got xml"
    xmlfd = StringIO.StringIO(xml)
    doc = PdfToHTMLOutputParser(xmlfd)
    print "got doc"
    #import sys
    #doc = PdfToHTMLOutputParser(open(sys.argv[1]))
    org = {}
    grouper = TextGrouper()
    grouper.add_patterns(
        (re.compile("APPC register entry ", re.IGNORECASE), "dates"),
        ("Address(es) in UK", "address"),
        ("Address in UK", "address"),
        ("Contact", "contact"),
        ("Offices outside UK", "section"),
        (re.compile("providing PA consultancy services", re.IGNORECASE), "section"),
        (re.compile("clients for whom", re.IGNORECASE), "section"),
    )

    def font_0(item):
        if item.fontspec.number == 0:
            item.props['type'] = 'name'
            item.props['grabbottom'] = 20
            print "Marked title:", repr(item.text)

    grouper.special_fns.append(font_0)
    grouper.group(doc.text(merge_verticals=True))
    #grouper.display()
    #grouper.display_full()
    for area in grouper.areas:
        yield area
def fetch_record(url):
    f = requests.get(url)
    pdf = scraperwiki.pdftoxml(f.content)
    root = lxml.etree.fromstring(pdf)
    texts = root.xpath("//text")
    rows = {}
    for text in texts:
        top = int(text.xpath("./@top")[0]) / 10 * 10
        left = int(text.xpath("./@left")[0])
        value = text.text.strip()
        if top not in rows:
            rows[top] = []
        rows[top].append(value)
    rows_sorted = [rows[key] for key in sorted(rows.keys())]
    first_row = rows_sorted[0][0]
    words = first_row.split(" ")
    month = list(calendar.month_name).index(words[-2])
    if month == 0:
        raise Exception("Cannot parse month")
    year = int(words[-1])
    rows_sorted = rows_sorted[1:-1]
    header = [convert(s) for s in rows_sorted[0]]
    num_rows = len(header)
    for i in range(1, len(rows_sorted)):
        row = rows_sorted[i]
        k = num_rows - len(row)
        if k > 0:
            padding = [""] * k
            rows_sorted[i] = padding + rows_sorted[i]
    for i in range(1, len(rows_sorted)):
        if len(rows_sorted[i][0]) == 0:
            rows_sorted[i][0] = rows_sorted[i - 1][0]
        d = {"year": year, "month": month}
        for j in range(0, len(header)):
            d[header[j]] = rows_sorted[i][j]
        scraperwiki.sqlite.save(unique_keys=['year', 'month', 'district'], data=d)
        print d
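# The trick fetch_record() relies on, pulled out as a standalone sketch: group
# pdftoxml <text> fragments into visual rows by rounding the top coordinate
# down to the nearest 10 pixels (Python 2 integer division), so fragments on
# the same printed line share a bucket even when their tops differ by a pixel or two.
def group_rows(texts):
    rows = {}
    for text in texts:
        top = int(text.get("top")) / 10 * 10  # bucket key: top rounded down to 10px
        rows.setdefault(top, []).append(text.xpath("string()").strip())
    # return the buckets in top-to-bottom page order
    return [rows[key] for key in sorted(rows.keys())]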
def get_id_period(self, date):
    from_iso_dt, to_iso_dt = util.inc_dt(date.strftime(util.ISO8601_DATE), util.ISO8601_DATE, self.PERIOD_TYPE)
    from_dt = util.get_dt(from_iso_dt, util.ISO8601_DATE)
    to_dt = util.get_dt(to_iso_dt, util.ISO8601_DATE)
    url_date = to_dt.strftime(self.search_url)
    if self.DEBUG: print url_date
    try:
        response = self.br.open(url_date)
    except:
        response = None
    final_result = []
    if response:
        pdfxml = scraperwiki.pdftoxml(response.read())
        if self.DEBUG: print pdfxml
        url = response.geturl()
        result = scrapemark.scrape(self.scrape_ids1, pdfxml, url)
        if not result or not result.get('records'):
            result = scrapemark.scrape(self.scrape_ids2, pdfxml, url)
        if result and result.get('records'):
            for rec in result['records']:
                rec['url'] = url_date
                rec['start_date'] = rec['date_received']
                try:
                    map_ref_list = rec['os_map_ref'].split()
                    rec['easting'] = map_ref_list[0]
                    rec['northing'] = map_ref_list[1]
                    del rec['os_map_ref']
                except:
                    pass
            self.clean_ids(result['records'])
            for rec in result['records']:  # note do this after record cleaning
                rec['date_scraped'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            final_result.extend(result['records'])
        #else:
        #    return [], None, None
    return final_result, from_dt, to_dt  # note weekly result might sometimes be legitimately empty
def carregaPagina(url):
    print url
    pdfdata = urllib2.urlopen(url).read()
    #print "The pdf file has %d bytes" % len(pdfdata)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    print "The first 5000 characters are: ", xmldata[:5000]
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)
    print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    arquivo = url.replace("https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/Arquivos/", "")
    cabecalho = True
    for page in pages:  #[:1]
        data = {}
        conta = 0
        coluna = 0
        for el in list(page)[:110]:  #[:100]
            if el.tag == "text":
                data[colunas[coluna]] = el.text.strip()
                coluna = coluna + 1
                if coluna >= len(colunas):
                    if not cabecalho:
                        data['arquivo'] = arquivo
                        #print el.attrib['left'], el.text
                        if conta < 10:
                            print data
                        scraperwiki.datastore.save(["arquivo", "nome", "cargo", "municipio"], data)
                        conta = conta + 1
                    data = {}
                    cabecalho = False
                    coluna = 0
        print "Pagina %s: %s registro(s)" % (page.attrib.get("number"), conta)
def main():
    url = "http://www.freedomhouse.org/sites/default/files/Freedom%20OnThe%20Net_Full%20Report.pdf"
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    goodpages = [27, 28, 29]
    rootdata = lxml.etree.fromstring(xmldata)
    pages = list(rootdata)
    page = pages[23]
    alltext = getText(page)
    dict1(alltext)
    for i in goodpages:
        page = pages[i]
        alltext = getText(page)
        dict2(alltext)
    pagenumbers = [30, 36, 47, 53, 57, 61, 65, 72, 78, 82, 87, 97, 102, 108]
    Country = ['Brazil', 'China', 'Cuba', 'Egypt', 'Estonia', 'Georgia', 'India', 'Iran', 'Kenya', 'Malaysia', 'Russia', 'Tunisia', 'Turkey', 'UK']
    for i in range(len(pagenumbers)):
        page = pages[pagenumbers[i]]
        alltext = getText(page)
        PageInfo(alltext, Country[i])
import scraperwiki
import urllib2
import lxml.etree
import bs4

url = "http://dget.nic.in/ItiUpgradePPP/list%20of%20%20ITIs%20only%20wth%20industry%20partners10-11.pdf"
pdfdata = urllib2.urlopen(url).read()
#print "The pdf file has %d bytes" % len(pdfdata)
xmldata = scraperwiki.pdftoxml(pdfdata)
#print "After converting to xml it has %d bytes" % len(xmldata)
#print "The first 2000 characters are: ", xmldata[:2000]
root = lxml.etree.fromstring(xmldata)
print xmldata
soup = bs4.BeautifulSoup(xmldata)
#print soup
start = False
ITI = True
sl_no = 0
for link in soup.find_all('text'):
    #print link.textcontent()
    #print link.get_text()
    #print str(start)
    text = link.get_text()
    text = text.replace(',', ' ')
    if start:
        if len(text) > 4 or text.count('NIL') > 0:
            #print text
            #if text.count('(ITI-')>0:
            #    continue
            pass
    geocode = simplejson.loads(geo_response.read())
    print geocode_url
    print geocode
    #Google imposes query limits, this lets us pass a failure and have the loop sleep and try again after 2 seconds
    if geocode['status'] == "OVER_QUERY_LIMIT":
        return 0
    if geocode['status'] != 'ZERO_RESULTS':
        coord_lat = geocode['results'][0]['geometry']['location']['lat']
        coord_lon = geocode['results'][0]['geometry']['location']['lng']
        coord.append(coord_lat)
        coord.append(coord_lon)
    print coord
    return coord

url = "https://www.denvergov.org/Portals/707/documents/mydenverdrive/1-22-25-2013.pdf"
xml = scraperwiki.pdftoxml(urllib2.urlopen(url).read())
parsed = BeautifulSoup(xml).text.split("\n")
filtered_list = parsed[parsed.index('Location: '):]
closures = []
i = 0
current_closure = -1
while i < len(filtered_list):
    text = filtered_list[i]
    if text == "Location: ":
        closures.append({})
        current_closure = len(closures) - 1
        i += 1
        closures[current_closure]['location'] = filtered_list[i]
        #print filtered_list[i]
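# The OVER_QUERY_LIMIT comment above implies a retry loop in the calling code,
# which is cut off in this snippet. A minimal sketch, assuming the truncated
# helper above is named geocode_address (its def line is missing here):
import time

def geocode_with_retry(address):
    while True:
        coord = geocode_address(address)  # hypothetical name for the truncated helper
        if coord != 0:  # the helper returns 0 when Google throttles the query
            return coord
        time.sleep(2)  # sleep and try again after 2 seconds, per the comment above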
# Blank Python
import sys
import scraperwiki
import urllib
import lxml.etree, lxml.html
import re
# for the geocode
from geopy import geocoders
import json

pdfurl = "http://www.nikebiz.com/responsibility/documents/factory_disclosure_list.pdf"
pdfdata = urllib.urlopen(pdfurl).read()
pdfxml = scraperwiki.pdftoxml(pdfdata)
root = lxml.etree.fromstring(pdfxml)
g = geocoders.Google('ABQIAAAAJWpc-texCflE7mMP0dgMGRTudD1_fegkcYIvU14JimqYoyT2khRxYTlCvIBPJApaoqvk4JfEfbrhyg')
for page in root:
    assert page.tag == 'page'
    #print "page details", page.attrib
    pagelines = {}
    pagedata = {}
    for v in page:
        if v.tag == 'text':
            text = re.match('(?s)<text.*?>(.*?)</text>',
                            lxml.etree.tostring(v)).group(1)
from scraperwiki import pdftoxml
from urllib2 import urlopen
from lxml.html import fromstring, tostring
import lxml.etree

def get_pdf_list():
    raw = urlopen('http://www.dropbox.com/sh/gpi0ejooop07x8a/bMDz4s9Ixp').read()
    html = fromstring(raw)
    a_elements = html.cssselect('li.browse-file.list-view-cols div.filename-col a')
    pdf_urls = [a.attrib['href'] + '?dl=1' for a in a_elements]
    return pdf_urls

test_url = 'https://www.dropbox.com/sh/gpi0ejooop07x8a/qJxWkjx8fz/ENDA-HR1858-June1997.pdf?dl=1'
raw_pdf = urlopen(test_url).read()
pdfxml = lxml.etree.fromstring(pdftoxml(raw_pdf))
rawtext = pdfxml.xpath('string()').replace('\n', ' ')
print rawtext
def get_root(filename):
    with open(filename, "rb") as f:
        return lxml.html.fromstring(scraperwiki.pdftoxml(f.read()))
import scraperwiki
from BeautifulSoup import BeautifulSoup
import time
import urllib

urltemplate = 'http://www.fco.gov.uk/resources/en/protocol/ldl-'
date = time.strftime('%A %d %B %Y')
month = (date.split(' '))[2]
year = (date.split(' '))[3]
#url = urltemplate + month + year
url = 'http://www.fco.gov.uk/resources/en/protocol/ldl-August2010'
pdfinput = urllib.urlopen(url)
print 'got pdf'
scraped = scraperwiki.pdftoxml(pdfinput.read())
print 'pdftohtml complete'
output = []

def getlastrow():
    l = len(output)
    return output[l - 1]  # the last row

print 'finished setup'
soup = BeautifulSoup(scraped)
print 'soup cooked'
# this document is a right dog