def rp_table():
    url = "http://www.atoc.org/clientfiles/File/routeing_points.pdf"
    print "Processing routeing point list"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    #print "The first 20000 characters are: ", xmldata[:20000]
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS rp_maps')
    scraperwiki.sqlite.execute('CREATE TABLE rp_maps (routeing_point, mapname)')
    print "Processing XML"
    # This is horrid, and assumes that the PDF will be in the correct order.
    rp = None  # set when a routeing-point name cell is seen
    for _, cell in lxml.etree.iterparse(StringIO.StringIO(xmldata), tag='text'):
        if int(cell.attrib['top']) > 100:
            if cell.attrib['left'] == '38':
                rp = cell.xpath('string()').title()
            else:
                for mapname in cell.xpath('string()').split():
                    scraperwiki.sqlite.execute('INSERT INTO rp_maps VALUES (?, ?)', (rp, mapname))
        cell.clear()
    print "Creating indexes"
    scraperwiki.sqlite.execute('CREATE INDEX maps_byrp ON rp_maps(routeing_point)')
    scraperwiki.sqlite.execute('CREATE INDEX rps_bymap ON rp_maps(mapname)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Routeing point list processed"
def do_it(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    extract the key question grades and inspection date as a dict.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    # just turn it into plain text
    raw = ''
    for index, page in enumerate(root):
        for text in page:
            raw += ' '.join(text.xpath("descendant-or-self::text()"))
            raw += "\n"
    # pull out the grades
    data = {}
    grades = extract_grades(raw)
    for i in range(1, 8):
        data['key_question_grade_%d' % (i,)] = grades[i]
    data['date_of_inspection'] = extract_date(raw)
    return data
def getpages(href):
    pdfdata = scraperwiki.scrape(href)
    xml = scraperwiki.pdftoxml(pdfdata)
    dom = lxml.etree.fromstring(xml)
    pages = list(dom)
    print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    return pages
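# Usage sketch (not from the original scrapers): the pages returned by
# getpages() are <page> elements whose <text> children carry the pdftoxml
# layout attributes (top, left) that the other functions in this file key on.
def dump_text_positions(href):
    for page in getpages(href):
        for el in page:
            if el.tag == "text":
                print page.attrib.get("number"), el.attrib.get("top"), el.attrib.get("left"), el.xpath("string()")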
def getpdfs():
    html = parse('http://www.safaricom.co.ke/index.php?id=275').getroot()
    html.make_links_absolute()
    pdf_urls = html.xpath('//table[@class="contenttable" and @width="540"]/descendant::a/@href')
    for url in pdf_urls:
        save(['date_scraped', 'url'],
             {"date_scraped": DATE, "url": url, "pdfxml": pdftoxml(urlopen(url).read())},
             'pdfs')
def pdfGrabber(typ):
    #if src == 'f1mediacentre': url = "http://www.fia.com/en-GB/mediacentre/f1_media/Documents/"+race+"-"+typ+".pdf"
    #http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-f1-2012-fp1-times.pdf
    #http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-f1-2012-fp1-classification.pdf
    ##trying http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-fp1-classification.pdf
    rnum = '08'
    typ2 = typ.replace('session', 'fp')
    if src == 'f1mediacentre':
        url = "http://184.106.145.74/fia-f1/f1-2012/f1-2012-" + rnum + "/" + race + "-f1-2012-" + typ2 + ".pdf"
    else:
        url = "http://dl.dropbox.com/u/1156404/" + race + "-" + typ + ".pdf"
        #url = 'http://dl.dropbox.com/u/1156404/mal-race-analysis.pdf'
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    '''
    print "After converting to xml it has %d bytes" % len(xmldata)
    print "The first 2000 characters are: ", xmldata[:2000]
    '''
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)
    #print 'pre', pages
    print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    return pages
def parseReport(pdfurl, urn):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    parse the report pages and save them to sqlite.'''
    try:
        pdfdata = urllib2.urlopen(pdfurl).read()
        if pdfdata == '':
            return "Failed to load/PDF does not exist"
        pdfxml = scraperwiki.pdftoxml(pdfdata)
        root = lxml.etree.fromstring(pdfxml)
        reportdata = []
        #print "URN %s URL %s" % (urn, pdfurl)
        # Save each page of the PDF.
        for index, page in enumerate(root):
            data = PageSave(page, index, urn)
            reportdata.append(data)
            for ldata in data:
                #print ldata
                lldata = ldata.copy()
                lldata["urm"] = urn
                scraperwiki.sqlite.save(unique_keys=ldata.keys(), data=lldata, table_name="other")
        #print reportdata
        report = {'urn': urn, 'data': reportdata}
        print report
        scraperwiki.sqlite.save(unique_keys=["urn"], data=report)
        return "Success"
    except Exception, e:
        return "Error %s" % e
def Main(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    output the contents as a styled HTML div.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    global styles
    fontspecs = {}
    # Get the PDF's internal styles: we'll use these to style the divs containing the PDF.
    for fontspec in root.xpath('page/fontspec'):
        id = fontspec.attrib.get('id')
        fontdesc = {'size': int(fontspec.attrib.get('size')),
                    'family': fontspec.attrib.get('family'),
                    'color': fontspec.attrib.get('color')}
        fontspecs[id] = fontdesc
        styles['div.fontspec-%s' % id] = 'color:%s;font-family:%s;font-size:%dpx' % (fontdesc['color'], fontdesc['family'], fontdesc['size'])
    # Output the view, with instructions for the user.
    print '<html dir="ltr" lang="en">'
    print '<head>'
    print ' <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    print ' <title>PDF to XML text positioning</title>'
    print ' <style type="text/css" media="screen">%s</style>' % "\n".join(["%s { %s }" % (k, v) for k, v in styles.items()])
    print ' <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>'
    print ' <script>%s</script>' % jscript
    print '</head>'
    # Print each page of the PDF.
    for index, page in enumerate(root):
        print Pageblock(page, index)
def preprocess(self, pdfurl, pdfcontent):
    print "Preprocessing PDF " + pdfurl
    if not pdfcontent:
        raise ValueError("No pdf content passed for " + pdfurl)
    if self.hiddentext:
        options = '-hidden'
    else:
        options = ''
    xml = scraperwiki.pdftoxml(pdfcontent, options)
    if self.debug:
        print xml
    pages = re.findall('(<page .+?</page>)', xml, flags=re.DOTALL)
    xml = None
    # print pages[:1][:1000]
    pagecount = 0
    datastore = []
    for page in pages:
        pagecount = pagecount + 1
        self.is_valid_page(pdfurl, pagecount, page)
        data = {
            'scrapedurl': pdfurl,
            'pagenum': pagecount,
            'pagecontent': page,
        }
        datastore.append(data)
    if 0 < len(datastore):
        scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable)
    else:
        raise ValueError("Unable to find any pages in " + pdfurl)
    pages = None
def scrapereport(reportlink):
    boldline = 0
    html = scraperwiki.scrape(baseurl + reportlink)
    root = lxml.html.fromstring(html)
    links = root.cssselect("div#unusefulbottom a")  #<div id="unusefulbottom">
    for link in links:
        print "LINK GRABBED WITH CSSSELECT", link
        print "link.attrib.get", link.attrib.get('href')
        downloadlink = link.attrib.get('href')
        # print "downloadlink[0].text_content()", downloadlink[0].text_content()
        pdfdata = urllib2.urlopen(baseurl + downloadlink).read()
        print "pdfdata", pdfdata
        xmldata = scraperwiki.pdftoxml(pdfdata)
        print "xmldata", xmldata
        pdfxml = lxml.etree.fromstring(xmldata)
        print "pdfxml", pdfxml
        boldtags = pdfxml.xpath('.//text')
        linenumber = 0
        for heading in boldtags:
            linenumber = linenumber + 1
            #print "Heading:", heading.text
            if heading.text is not None:
                # mention = re.match(r'.*NMS.*', heading.text)
                mention = re.match(r'.*overall.*', heading.text)
                if mention:
                    boldline = boldline + 1  # so each match gets its own uniqueref
                    print "FULL LINE", lxml.etree.tostring(heading, encoding="unicode", method="text")
                    # print "OVERALL", heading.text
                    # print "CHECK", pdfxml.xpath('.//text')[linenumber-1].text
                    # print "LINEAFTER", pdfxml.xpath('.//text')[linenumber].text
                    record = {}
                    record['overall'] = lxml.etree.tostring(heading, encoding="unicode", method="text")
                    record['uniqueref'] = reportlink + "_" + str(boldline)
                    record['downloadlink'] = baseurl + downloadlink
                    scraperwiki.sqlite.save(['uniqueref'], record)
def read_file_return_etree(uid):
    with open('cached_pdfs/{}.pdf'.format(uid), 'rb') as f:
        pdfdata = f.read()  # str
    xmldata = scraperwiki.pdftoxml(pdfdata)  # unicode
    xmldata = bytes(bytearray(xmldata, encoding='utf-8'))  # str
    element_tree = ET.fromstring(xmldata)
    return element_tree
def scrape_cieisp(year, text):
    if year == 2010:
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/INCIDENCIA_DELICTIVA_2010_030211.pdf"
    else:
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/CIEISP" + str(year) + ".pdf"
    a = scraperwiki.scrape(pdfurl)
    s = BeautifulSoup(scraperwiki.pdftoxml(a))
    dolosos_position = []
    i = 0
    for t in s.findAll('text'):
        if t.text == "DOLOSOS":
            if text == "POR ARMA DE FUEGO":
                dolosos_position.append(i + 14)
            else:
                dolosos_position.append(i)
        i += 1
    all_text = s.findAll('text')
    #print all_text
    if year <= 2008:
        if year >= 2006:
            states_names = states3
        else:
            states_names = states2
    else:
        states_names = states
    for i in range(0, 33):
        for j in range(1, 14):
            record = {'State': states_names[i],
                      'Year': year,
                      'Month': months[j - 1],
                      'Homicides': all_text[dolosos_position[i] + j].text,
                      'Crimetype': text}
            scraperwiki.datastore.save(["State", "Year", "Month"], record)
    return
def getTablePages():
    url = "http://www.bnr.rw/docs/publicnotices/List%20of%20MFIs%20Update_Sept_%202011.pdf"
    pdfdata = urlopen(url).read()
    xmldata = pdftoxml(pdfdata)
    root = fromstring(xmldata)
    pages = list(root)
    return pages
def pdfParser(url, path):
    txt = []
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    # 4. Have a peek at the XML (click the "more" link in the Console to preview it).
    #print lxml.etree.tostring(root, pretty_print=True)
    # 5. How many pages in the PDF document?
    pages = list(root)
    #print "There are", len(pages), "pages"
    # 6. Iterate through the elements in each page, and preview them
    for page in pages:
        for el in page:
            if el.tag == "text":
                #print el.text, el.attrib
                if el.text is not None:
                    txt.append(el.text)
    try:
        ftxt = open(path + '/text.txt', 'w')
        ftxt.write("\n".join(txt).encode('utf-8'))
        ftxt.close()
    except:
        pass
def urltohtml(url="http://www.madingley.org/uploaded/Hansard_08.07.2010.pdf"):
    import scraperwiki, urllib2, lxml.etree
    lazycache = scraperwiki.swimport('lazycache')
    pdfdata = lazycache.lazycache(url)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)

    # this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
    def gettext_with_bi_tags(el):
        res = []
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
        if el.tail:
            res.append(el.tail)
        return "".join(res)

    # collect the first hundred text elements from each page
    text = []
    for page in pages:
        for el in list(page)[:100]:
            if el.tag == "text":
                text.append(gettext_with_bi_tags(el))
    return '\n'.join(text)
def fetch_rows(url, x_threshold):
    points_rows = []
    districts_rows = []
    print url
    f = requests.get(url)
    pdf = scraperwiki.pdftoxml(f.content)
    root = lxml.etree.fromstring(pdf)
    pages = root.xpath("//page")
    for page in pages:
        page_number = int(page.xpath("./@number")[0])
        texts = page.xpath("./text")
        tag = 0
        for text in texts:
            x = int(text.xpath("./@left")[0])
            y = int(text.xpath("./@top")[0])
            if text.text is not None:
                s = re.sub(r'\d+\.', '', text.text).strip()
                m = re.match(r'\d+\.', text.text)
                if m is not None:
                    tag = int(m.group(0).strip()[0:-1])
                if len(s) == 0:
                    continue
                d = {'text': s, 'y': y, 'x': x, 'page': page_number, 'tag': tag}
                if x >= x_threshold:
                    points_rows.append(d)
                else:
                    districts_rows.append(d)
    return (points_rows, districts_rows)
def parse_pdf(url):
    pdf_data = urllib2.urlopen(url).read()
    assert len(pdf_data) > 0
    xml_data = sw.pdftoxml(pdf_data)
    tree = etree.parse(StringIO(xml_data))
    root = tree.getroot()
    print root.xpath('//text[@left < 200]')
def GetPDFtrans():
    pdfurl = "http://www.birmingham.gov.uk/cs/Satellite?%26ssbinary=true&blobcol=urldata&blobheader=application%2Fpdf&blobheadername1=Content-Disposition&blobkey=id&blobtable=MungoBlobs&blobwhere=1223439077563&blobheadervalue1=attachment%3B+filename%3D444523Payments+over+%C2%A3500+August.pdf"
    c = urllib.urlopen(pdfurl).read()
    x = scraperwiki.pdftoxml(c)
    print x[:4000]
    urlup = "http://seagrass.goatchurch.org.uk/~julian/cgi-bin/uu.cgi"
    d = urllib.urlencode({"name": "brumpdf500xml", "contents": x})
    print urllib.urlopen(urlup, d).read()
def scrapeschool(url):
    print url
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    #create an empty variable 'record', which is a dictionary
    record = {}
    #create a uniqueid that we'll add to with each record later
    uniqueid = 0
    record["school"] = root.cssselect("h1")[0].text_content()
    record["parentviewurl"] = root.xpath(".//div[@id='content']//a")[0].attrib.get('href')
    #Expressed more simply, this could take up three lines like so:
    # parentviewurls = root.xpath(".//div[@id='content']//a")
    # parentviewurl = parentviewurls[0].attrib.get('href')
    # record["parentviewurl"] = parentviewurl
    record["URN"] = root.xpath(".//div[@id='content']//p//strong")[0].text_content()
    record["Address"] = lxml.etree.tostring(root.xpath(".//div[@id='content']//p")[1])
    report1url = root.xpath(".//table[@summary='Previous reports']//td//a")[0].attrib.get('href')
    record["report1url"] = report1url
    # record["inspectiondate"] = root.xpath(".//table[@summary='Previous reports']//td")[1].text_content
    uniqueid += 1
    record["uniqueid"] = uniqueid
    print record
    #use the urllib2 library's .urlopen function to open the full PDF URL, and the .read() function to read it into a new object, 'pdfdata'
    pdfdata = urllib2.urlopen(baseurl + report1url).read()
    #use pdftoxml to convert that into an xml document
    pdfread = scraperwiki.pdftoxml(pdfdata)
    print pdfread
    #use lxml.etree to convert that into an lxml object
    pdfroot = lxml.etree.fromstring(pdfread)
    leadership = re.search(r'b>The quality of .* <b', pdfread)
    if leadership:
        # print linenumber
        print leadership.group()
    #find all <b> tagged lines - headings?
    lines = pdfroot.findall('.//text')
    linenumber = 0
    for line in lines:
        linenumber = linenumber + 1
        if line.text:
            FSM = re.match(r'.* free school meals .*', line.text)
            if FSM:
                print linenumber
                print FSM.group()
                # if pdfroot.xpath('.//text')[linenumber-2].text:
                print pdfroot.xpath('.//text')[linenumber - 2].text
                print pdfroot.xpath('.//text')[linenumber - 1].text
                print pdfroot.xpath('.//text')[linenumber].text
                # if pdfroot.findall('.//text')[linenumber].text:
                record["FSM3"] = pdfroot.findall('.//text')[linenumber].text
                print record
    #UP TO HERE. NEED TO:
    #IDENTIFY THE LINE WE WANT - PERHAPS .XPATH AND (CONTAINS)
    #GRAB X CHARACTERS AFTER THAT - OR:
    #IDENTIFY THE INDEX POSITION OF THAT <TEXT><B> HEADING AND THE NEXT ONE AND GRAB ALL LINES BETWEEN
    scraperwiki.sqlite.save(["uniqueid"], record)
def get_id_period(self, date):
    from_iso_dt, to_iso_dt = util.inc_dt(date.strftime(util.ISO8601_DATE), util.ISO8601_DATE, self.PERIOD_TYPE)
    from_dt = util.get_dt(from_iso_dt, util.ISO8601_DATE)
    to_dt = util.get_dt(to_iso_dt, util.ISO8601_DATE)
    url_date = to_dt.strftime(self.search_url1)
    if self.DEBUG: print url_date
    try:
        response = self.br.open(url_date)
    except:
        url_date = to_dt.strftime(self.search_url2)
        if self.DEBUG: print url_date
        try:
            response = self.br.open(url_date)
        except:
            url_date = to_dt.strftime(self.search_url3)
            if self.DEBUG: print url_date
            try:
                response = self.br.open(url_date)
            except:
                response = None
    final_result = []
    if response:
        html = response.read()
        if self.DEBUG: print html
        url = response.geturl()
        result = scrapemark.scrape(self.scrape_ids1, html, url)
        if not result or not result.get('records'):
            result = scrapemark.scrape(self.scrape_ids1a, html, url)
        if not result or not result.get('records'):
            result = scrapemark.scrape(self.scrape_ids2, html, url)
        if not result or not result.get('records'):
            pdfxml = scraperwiki.pdftoxml(html)
            if self.DEBUG: print pdfxml
            result = scrapemark.scrape(self.scrape_ids3, pdfxml, url)
        if result and result.get('records'):
            for rec in result['records']:
                rec['url'] = url_date
                rec['date_received'] = to_iso_dt
                rec['start_date'] = to_iso_dt
                if rec.get('agent1'):
                    if rec.get('agent2'):
                        rec['agent_address'] = rec['agent1'] + ' ' + rec['agent2']
                    else:
                        rec['agent_address'] = rec['agent1']
                    del rec['agent1']
                if 'agent2' in rec:
                    del rec['agent2']
            self.clean_ids(result['records'])
            for rec in result['records']:  # note do this after record cleaning
                rec['date_scraped'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            final_result.extend(result['records'])
        #else:
        #    return [], None, None
    return final_result, from_dt, to_dt
def __init__(self, filename):
    # load the pdf
    with open(filename, 'rb') as f:
        pdf_string = f.read()
    # convert to xml
    xml_string = scraperwiki.pdftoxml(pdf_string)
    # parse xml
    self._xml = lxml.etree.fromstring(xml_string)
    self._pages = [self._page_to_blocks(page_num)
                   for page_num in range(1, self.page_count() + 1)]
def ConvertPDFtoSqlite(docname, pdfurl):
    print "converting", docname, pdfurl
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    try:
        root = lxml.etree.fromstring(pdfxml)
    except lxml.etree.XMLSyntaxError, e:
        print "Bad xml file", str(e)
        print pdfxml[:19000]
        return
def Main(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    output the contents as a styled HTML div.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    global styles
    fontspecs = {}
    # Get the PDF's internal styles: we'll use these to style the divs containing the PDF.
    for fontspec in root.xpath('page/fontspec'):
        id = fontspec.attrib.get('id')
        fontdesc = {'size': int(fontspec.attrib.get('size')),
                    'family': fontspec.attrib.get('family'),
                    'color': fontspec.attrib.get('color')}
        fontspecs[id] = fontdesc
        styles['div.fontspec-%s' % id] = 'color:%s;font-family:%s;font-size:%dpx' % (fontdesc['color'], fontdesc['family'], fontdesc['size'])
    # Output the view, with instructions for the user.
    print '<html dir="ltr" lang="en">'
    print '<head>'
    print ' <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    print ' <title>PDF to XML text positioning</title>'
    print ' <style type="text/css" media="screen">%s</style>' % "\n".join(["%s { %s }" % (k, v) for k, v in styles.items()])
    print ' <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>'
    print ' <script>%s</script>' % jscript
    print '</head>'
    print '<div class="info" id="info1">&lt;text block&gt;</div>'
    print '<div class="info" id="info2">&lt;position&gt;</div>'
    print '<div class="heading">'
    print '<h2>Graphical preview of scraperwiki.pdftoxml(pdfdata)</h2>'
    print '<p>Click on a text line to see its coordinates and any other text that shares the same column or row.'
    print ' Useful for discovering what coordinates to use when extracting rows from tables in a document.</p>'
    print '<p>To do: track the coordinates of the mouse and cross reference with <a href="/cropper">cropper</a> technology.</p>'
    print '<p class="href"><a href="%s">%s</a></p>' % (pdfurl, pdfurl)
    print '<form id="newpdfdoclink">'
    print ' Another PDF link:'
    print ' <input type="text" name="url" value="" title="paste in url of new document">'
    print ' <input type="submit" value="Go">'
    print '</form>'
    ttx = re.sub('<', '&lt;', pdfxml)
    ttx = re.sub('\n', '\r\n', ttx)
    print '<textarea class="pdfprev">%s</textarea>' % ttx[:5000]
    print '</div>'
    print '<p>There are %d pages</p>' % len(root)
    # Print each page of the PDF.
    for index, page in enumerate(root):
        print Pageblock(page, index)
def parsepdf(pdfurl):
    a = scraperwiki.scrape(pdfurl)
    s = BeautifulSoup(scraperwiki.pdftoxml(a))
    kobcine = {}
    for t in s.findAll('text'):
        if t.text != " ":
            ko_ime = find_ko(t.text)
            if ko_ime:
                ko = kobcine.setdefault(ko_ime, 0)
                kobcine[ko_ime] = ko + 1
    return kobcine
def paginating(url):
    pages = range(13, 276, 2)
    pdf_data = urllib2.urlopen(url).read()
    xml_data = sw.pdftoxml(pdf_data)
    html_data = html.fromstring(xml_data)
    for page in pages:
        page_data = html_data.cssselect('page')[page]
        print html.tostring(page_data)
        parse_pdf_header(page_data)
def pink_pages():
    #
    # Pink pages
    #
    print "Loading pink pages (station to RP mapping)"
    url = "http://www.atoc.org/clientfiles/File/routeing_point_identifier.pdf"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    # print "After converting to xml it has %d bytes" % len(xmldata)
    # print "The first 20000 characters are: ", xmldata[:20000]
    print "Parsing XML"
    root = lxml.etree.fromstring(xmldata)
    # Each station is on a single line consisting of the station name and then
    # the various routeing points.
    stncells = root.xpath('//text[@left=37]')
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS routeing_points')
    scraperwiki.sqlite.execute('CREATE TABLE routeing_points (station, routeing_point)')
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS groups')
    scraperwiki.sqlite.execute('CREATE TABLE groups (station, stngroup)')
    find_other_cells = lxml.etree.XPath('following-sibling::text[@top=$this/@top]')
    print "Extracting station list"
    for stncell in stncells:
        # Find other cells on the same row of the same page.
        othercells = find_other_cells(stncell, this=stncell)
        for othercell in othercells:
            stn, other = stncell.xpath('string()'), othercell.xpath('string()')
            stn = stn.title()
            if other == "Routeing Point":
                other = stn
            if other.endswith(" Routeing Point Member"):
                other = other[:-22]
                scraperwiki.sqlite.execute('INSERT INTO groups VALUES (?, ?)', (str(stn), str(other)), verbose=0)
            scraperwiki.sqlite.execute('INSERT INTO routeing_points VALUES (?, ?)', (str(stn), str(other)), verbose=0)
    print "Creating indexes"
    scraperwiki.sqlite.execute('CREATE INDEX points_bystn ON routeing_points(station)')
    scraperwiki.sqlite.execute('CREATE INDEX groups_bystn ON groups(station)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Pink pages processed"
def main():
    link_src = '''https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=malaysian_parliament_hansard_url&query=select%20*%20from%20swdata%20limit%2010'''
    links = urllib2.urlopen(link_src)
    links_data = json.load(links)
    pdf_url = links_data[2]['url'].replace(' ', '%20')
    print pdf_url
    pdf_data = urllib2.urlopen(pdf_url).read()
    xml_data = scraperwiki.pdftoxml(pdf_data)
    xml_data = xml_data.replace('<b>', '').replace('</b>', '')
    print xml_data
    root = etree.fromstring(xml_data)
def parse_pdf(url, name, page_url):
    url = url.encode('ascii')
    name = name.encode('utf-8')
    print name
    pdf_url = "http://www.has-sante.fr/portail/" + url
    avis = " "
    avis2 = " "
    #follows the first link
    a = scraperwiki.scrape(pdf_url)
    a = a.lower()
    #finds the actual link (there's a redirect)
    soup = BeautifulSoup(a)
    pdf_url = soup.find("meta")
    pdf_url = pdf_url['content']
    pdf_url = pdf_url.replace("0; url='../../../../", "http://www.has-sante.fr/portail/")
    pdf_url = pdf_url[:-1]
    pdf_url = pdf_url.encode('ascii')
    #now for the real pdf
    try:
        b = scraperwiki.scrape(pdf_url)
        s = BeautifulSoup(scraperwiki.pdftoxml(b))
        #some basic regex to extract meaningful info
        for t in s.findAll('text'):
            if t.text != " ":
                pattern = '^.*?int.r.t de sant. publique.*?faible.*?$'
                pattern2 = '^.*?service m.dical rendu par.*?$'
                if re.search(pattern, t.text):
                    avis = t.text
                    avis = avis.encode('utf-8')
                    print avis
                elif re.search(pattern2, t.text):
                    avis2 = t.text
                    avis2 = avis2.encode('utf-8')
                    print avis2
        #now we've got everything, we're adding it to the DB
        data = {}
        medoc_name = name
        data['Name'] = medoc_name
        data['pdf_url'] = pdf_url
        data['page_url'] = page_url
        data['interet_sante'] = avis + "\n" + avis2
        data[medoc_name] = medoc_name
        scraperwiki.datastore.save(['Name'], data)
    except:
        print "Error " + pdf_url
def process_pdf(pdfurl):
    # (harder example to work on: http://www.nihe.gov.uk/schemes_accepted_010109_to_310309.pdf )
    pdfdata = urllib.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    s = BeautifulSoup(pdfxml)
    entrylines = []
    # column values for the entry currently being assembled
    periode = innut = m = y = kapittel = None
    post = overfraifjor = bevilgning = samlbevilgning = regnskap = rest = None
    for text in s.findAll('text'):
        #print text
        entrylines.append(text)
        left = int(text['left'])
        if 82 == left:
            periode = text.text
            innut, m, y = periode.split(" ")
        if 107 == left:
            kapittel = text.text
        if 163 == left:
            post = text.text
        if 704 <= left and left <= 782:
            overfraifjor = text.text
        if 822 <= left and left <= 867:
            bevilgning = text.text
        if 920 <= left and left <= 965:
            samlbevilgning = text.text
        if 1011 <= left and left <= 1056:
            regnskap = text.text
        if 1124 <= left and left <= 1156 and u"1000 kr" != text.text.strip() and post is not None:
            rest = text.text
            if overfraifjor is None or bevilgning is None or samlbevilgning is None or regnskap is None or rest is None:
                error(entrylines, kapittel, post, overfraifjor, bevilgning, samlbevilgning, regnskap, rest)
            data = {
                'periode': periode,
                'year': y,
                'month': m,
                'type': innut,
                'kapittel': kapittel,
                'post': post,
                'overfraifjor': valstr2int(overfraifjor),
                'bevilgning': valstr2int(bevilgning),
                'samlbevilgning': valstr2int(samlbevilgning),
                'regnskap': valstr2int(regnskap),
                'rest': valstr2int(rest),
            }
            #print data
            #time.sleep(1)
            scraperwiki.sqlite.save(unique_keys=['periode', 'kapittel', 'post'], data=data)
            post = None
            overfraifjor = None
            bevilgning = None
            samlbevilgning = None
            regnskap = None
            rest = None
            entrylines = []
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    find_link = soup.findAll(href=re.compile("/downloads/pdf/crime_statistics/"))
    next_link = [None] * len(find_link)
    rep_link = [None] * len(find_link)
    for i in range(len(find_link)):
        next_link[i] = find_link[i]['href']
        rep_link[i] = next_link[i].replace('../..', 'http://www.nyc.gov/html/nypd')
    for i in range(len(rep_link)):
        a = scraperwiki.scrape(rep_link[i])
        #here I call my previously defined function to convert and scrape the pdf
        soup_pdf = BeautifulSoup(scraperwiki.pdftoxml(a))
        scrape_table(soup_pdf)
def extract_birth(cvfile):
    '''Extract birth year from PDF file with CV.'''
    pdf = open(cvfile, 'rb')
    xml = scraperwiki.pdftoxml(pdf.read())
    root = lxml.etree.fromstring(xml)
    birthstr = root.xpath('//text[@top="320"]')[0].text
    mf = 'M' if birthstr[3] == 'o' else 'F'
    birthyear = birthstr[-4:]
    print(mf, birthyear)
def pdf_scrape(pdf, directory):
    '''Convert pdf to xml'''
    with open("pdf/" + directory + "/" + pdf, "rb") as u:
        xml = pdftoxml(u.read())
    if not os.path.exists("xml"):
        os.mkdir("xml")
    with open("xml/" + pdf + ".xml", "w") as w:
        w.write(xml)
    return xml
def main():
    url = "http://governor.ny.gov/citizenconnects/assets/document/CitizenConnectsdoc.pdf"
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    rootdata = lxml.etree.fromstring(xmldata)
    pages = list(rootdata)
    # print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    for page in pages:
        entries = getText(page)
        store(entries)
def Main(pdfurl):
    '''Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to
    output the contents as a styled HTML div.'''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    try:
        root = lxml.etree.fromstring(pdfxml)
    except lxml.etree.XMLSyntaxError, e:
        print str(e), str(type(e)).replace("<", "&lt;")
        print pdfurl
        print pdfxml.replace("<", "&lt;")
        root = []
def scrape_pdf(url):
    '''Scrape data from PDF at URL.'''
    try:
        pdf_data = urlopen(url).read()
    except:
        return "#MISSING!"
    pdf_xml = scraperwiki.pdftoxml(pdf_data)
    root = lxml.etree.fromstring(pdf_xml)
    #pages = list(root)
    full_text = get_pdf_text(root)
    return full_text
def process_pdf(pdfurl):
    pdfxml = u.findInCache(pdfurl, verbose=True)  # look for html parse in cache
    if pdfxml is None:  # a html parse is not cached
        pdfdata = lazycache.lazycache(pdfurl, verbose=True)  # look for pdf document in cache, if not download
        pdfxml = scraperwiki.pdftoxml(pdfdata, "-hidden")  # parse pdf text to html
        u.putInCache(pdfurl, pdfxml, verbose=True)  # save cache of html parse
    beautifulxml = BeautifulSoup(pdfxml)  # convert html to BeautifulSoup(4) object
    for page in beautifulxml.find_all('page'):
        FIRSTPAGE = 6
        LASTPAGE = 6
        if int(page['number']) < FIRSTPAGE:
            continue
        if int(page['number']) == FIRSTPAGE:
            print "*******************************************"
            print "***** FIRSTPAGE #%d while developing ******" % (FIRSTPAGE)
            print "*******************************************"
        if int(page['number']) == LASTPAGE + 1:
            print "*******************************************"
            print "****** LASTPAGE #%d while developing ******" % (LASTPAGE)
            print "*******************************************"
            break
        print("*******************************************")
        print("********** Working on page #%s **********" % page['number'])
        print("*******************************************")
        elementList = deque(page.find_all('text'))  # we want to be able to use popleft
        d(elementList)
        while True:
            try:
                currElement = elementList.popleft()
                if "Innhold:" in currElement.text and currElement.b:
                    # we found a "Innhold:"-header
                    entry = parseDocumentRecord(currElement, elementList)
                    print entry
                    scraperwiki.sqlite.save(unique_keys=["innhold", "sakstittel"], data=entry)
                    d("back in process_pdf")
                #else:
                #    print currElement.text
            except IndexError, e:
                d("No more text elements on page (%s)" % e)
                break
def Main(url):
    tmpfile = tempfile.gettempdir() + "/45_networkrail.zip"
    tmpdir = tempfile.gettempdir() + "/45_networkrail"  #+str(random.randint(2, 1000000000))
    urllib.urlretrieve(url, tmpfile)
    with zipfile.ZipFile(tmpfile, 'r') as myzip:
        myzip.extractall(tmpdir)
    f = open(tmpdir + "/completeTimetable.pdf", 'rb')
    pdfxml = scraperwiki.pdftoxml(f.read())
    #print(os.listdir(tmpdir))
    #print(pdfxml)
    root = lxml.etree.fromstring(pdfxml)
    print '<p>There are %d pages</p>' % len(root)
    print lxml.etree.tostring(root[0])
def scrapepdf(url):
    #use the urllib2 library's .urlopen function to open the full PDF URL, and the .read() function to read it into a new object, 'pdfdata'
    pdfdata = urllib2.urlopen(url).read()
    #use pdftoxml to convert that into an xml document
    pdfread = scraperwiki.pdftoxml(pdfdata)
    print pdfread
    #use lxml.etree to convert that into an lxml object
    pdfroot = lxml.etree.fromstring(pdfread)
    #find all <text> tags and put in list variable 'lines'
    lines = pdfroot.findall('.//text')
    #create variable 'linenumber', initialised at 0
    linenumber = 0
    record = {}
    #loop through each item in 'lines' list
    for line in lines:
        #add one to 'linenumber' so we can track which line we're dealing with
        linenumber = linenumber + 1
        #if 'line' has some text:
        if line.text is not None:
            #create a new variable 'mention' that is filled with the result of
            #using the 're' library's .search function
            mention = re.search(r'.*black.*', line.text)
            if mention:
                print line.text
                #the RANGE function generates a list from the first parameter to the second,
                #e.g. range(5,8) would make [5, 6, 7] - it doesn't include the 'end' of the range
                #in this case we're using the line number minus 2, and the linenumber as our start and end points
                print range(linenumber - 2, linenumber + 1)
                linebefore = "EMPTY LINE"
                lineafter = "EMPTY LINE"
                incontextlist = []
                if pdfroot.xpath('.//text')[linenumber - 2].text:
                    linebefore = pdfroot.xpath('.//text')[linenumber - 2].text
                incontextlist.append(linebefore)
                incontextlist.append(pdfroot.xpath('.//text')[linenumber - 1].text)
                if pdfroot.xpath('.//text')[linenumber].text is not None:
                    lineafter = pdfroot.xpath('.//text')[linenumber].text
                incontextlist.append(lineafter)
                print "mention.group()", mention.group()
                print "CAN YOU SEE ME?", ''.join(incontextlist)
                record["mention in context"] = ''.join(incontextlist)
                record["linenumber"] = linenumber
                #store the 'url' variable that was passed in at the start of this function
                record["url"] = url
                print record
                scraperwiki.sqlite.save(["linenumber", "url"], record)
def ExtractPdf(year, nz, pdfbin, lurl):
    mnz = re.match("(...).*?(?:\d\d)?(\d\d)?_3.pdf", nz)
    assert mnz, nz
    assert mnz.group(1).lower() in m3, nz
    dnz = "%d-%02d" % (mnz.group(2) and int(mnz.group(2)) + 2000 or int(year),
                       m3.index(mnz.group(1).lower()) + 1)
    #print "date", dnz
    root = lxml.etree.fromstring(scraperwiki.pdftoxml(pdfbin))
    currentcountry = None
    currentmission = None
    ldata = []
    data = None
    for page in list(root):
        rtblocks = []
        #print lxml.etree.tostring(page)
        for text in page:
            if text.tag != "text":
                continue
            if 130 <= int(text.attrib.get("left")) <= 140:
                #print lxml.etree.tostring(text)
                currentmission = None
                currentcountry = text_content(text).strip()
            if 276 <= int(text.attrib.get("left")) <= 280:
                if rtblocks and data:
                    lndata = parsemissionblock(rtblocks, data)
                    ldata.extend(lndata)
                currentmission = text_content(text).strip()
                data = {"link": lurl, "nz": nz, "month": dnz,
                        "country": currentcountry, "mission": currentmission,
                        "year": year}
                rtblocks = []
            if int(text.attrib.get("left")) > 350:
                rtblocks.append(text)
        if rtblocks and data:
            lndata = parsemissionblock(rtblocks, data)
            ldata.extend(lndata)
    scraperwiki.sqlite.save(["month", "country", "mission", "desc"], ldata)
    return dnz, len(ldata)
def scrape_pdf(url):
    '''Scrape data from PDF at URL.'''
    try:
        pdf_data = urlopen(url).read()
    except:
        return (None, None, None)
    pdf_xml = scraperwiki.pdftoxml(pdf_data)
    root = lxml.etree.fromstring(pdf_xml)
    page0 = root.find('page')
    try:
        content = dict(parse(list(tokenize(page0))))
    except ParseError:
        content = None
    full_text = get_pdf_text(root)
    return pdf_xml, full_text, content
def scrapepdf(pdfurl):
    #print "scraping " + pdfurl
    pdfdata = urllib.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    s = BeautifulSoup(pdfxml)
    #print s
    casenr = None
    datestr = None
    daynr = None
    last_line = ""
    for idx, text in enumerate(s.findAll('text')):
        msg = text.text
        #print msg
        if 0 == msg.find(u"Møte "):
            datestr = datestr2date(msg.split("den ")[1].split(" kl.")[0])
            #print datestr
        if 0 == msg.find("D a g s o r d e n (nr."):
            daynr = msg.split(")")[0].split(".")[1]
            continue
        if -1 != msg.find("Votering i sak nr."):
            #print msg
            casenr = msg.split("nr.")[1].strip()
            continue
        elif -1 != msg.find("Votering i sak "):
            #print msg
            casenr = msg.split("i sak ")[1].strip()
            continue
        if (-1 != msg.find("enstemmig bifalt") or -1 != msg.find("enstemmig vedtatt")
                or ((-1 != msg.find("bifalt") or -1 != msg.find("vedtatt"))
                    and -1 != last_line.find("ble enstemmig"))):
            #print datestr, daynr, casenr, msg
            data = {
                'index': idx,
                'date': datestr,
                'daynr': daynr,
                'casenum': casenr,
                'msg': last_line + msg,
            }
            if casenr is not None:
                scraperwiki.sqlite.save(unique_keys=['date', 'casenum', 'index'], data=data)
        last_line = msg
def maps():
    url = "http://www.atoc.org/clientfiles/File/Maps.pdf"
    print "Fetching maps"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    #print "The first 20000 characters are: ", xmldata[:20000]
    print "Converting PDF to PNGs"
    with tempfile.NamedTemporaryFile() as pdffile:
        pdffile.write(pdfdata)
        pdffile.flush()
        tmpdir = tempfile.mkdtemp()
        subprocess.check_call(['pdftoppm', '-r', '75', '-png', pdffile.name, os.path.join(tmpdir, 'p')])
    print "Parsing XML"
    root = lxml.etree.fromstring(xmldata)
    print "Processing maps"
    maptitles = root.xpath('//text[@height=100]')
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS maps')
    scraperwiki.sqlite.execute('CREATE TABLE maps (mapname, pageno, data)')
    for maptitle in maptitles:
        pageno = int(maptitle.xpath('string(../@number)'))
        with open(os.path.join(tmpdir, 'p-%03d.png' % pageno), 'rb') as f:
            scraperwiki.sqlite.execute('INSERT INTO maps VALUES (?,?,?)',
                                       (maptitle.xpath('string()'), pageno, base64.b64encode(f.read())))
    print "Creating index"
    scraperwiki.sqlite.execute('CREATE INDEX maps_bymap ON maps(mapname, pageno)')
    print "Committing maps"
    scraperwiki.sqlite.commit()
    print "Maps processed"
def yellow_pages():
    #
    # Yellow pages
    #
    # This file is huge, so we do the XML parsing incrementally.
    #
    print "Loading yellow pages (permitted route list)"
    url = "http://www.atoc.org/clientfiles/File/permitted_route_identifier.pdf"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    # print "After converting to xml it has %d bytes" % len(xmldata)
    # print "The first 20000 characters are: ", xmldata[:20000]
    orig = None
    dest = None
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS permitted_routes')
    scraperwiki.sqlite.execute('CREATE TABLE permitted_routes (orig, dest, maps)')
    print "Processing XML"
    # This is horrid, and assumes that the PDF will be in the correct order.
    for _, cell in lxml.etree.iterparse(StringIO.StringIO(xmldata), tag='text'):
        if cell.attrib['height'] == '10':
            if cell.attrib['left'] == '80':
                orig = cell.xpath('string()')
            elif cell.attrib['left'] == '208':
                dest = cell.xpath('string()')
            else:
                scraperwiki.sqlite.execute('INSERT INTO permitted_routes VALUES (?, ?, ?)',
                                           (orig, dest, cell.xpath('string()')))
        cell.clear()
    print "Creating index"
    scraperwiki.sqlite.execute('CREATE INDEX routes_bystn ON permitted_routes(orig, dest)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Yellow pages done"
def process_pdf(url):
    print "PROCESSING: ", url,
    pdfdata = urllib2.urlopen(url).read()
    print len(pdfdata), "bytes"
    if len(pdfdata) > 50000:
        return ""  #too BIG Daddio!
    result = ''  # accumulates the extracted words
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)

    def gettext_with_bi_tags(el):
        res = []
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
        if el.tail:
            res.append(el.tail)
        return "".join(res)

    for page in pages:
        print page.attrib.get("number")
    # collect the first thousand text elements from the first page
    page0 = pages[0]
    i = []
    data = []
    for el in list(page0)[:1000]:
        if el.tag == "text":
            data = {}
            text = strip_tags(gettext_with_bi_tags(el))
            #data['text'] = text
            #data['url'] = url  # The source of these words
            if text != '' and text != ' ':
                #scraperwiki.sqlite.save(i, data)
                result += " " + text
    return result
def getheadingsfrompdf(pdfurl):
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)
    ldata = []
    for page in root:
        for el in page:
            # needs also to do concatenation between headings that run to two lines,
            # and handle headings with italics in them <i>
            if el.tag == "text" and el.attrib.get("font") == "10" and len(el) == 1 and el[0].tag == "b":
                data = {"pdfurl": pdfurl,
                        "pagenumber": int(page.attrib.get("number")),
                        "heading": el[0].text}
                ldata.append(data)
    scraperwiki.sqlite.save(["pdfurl", "pagenumber", "heading"], ldata, "subheadings")
def scrape():
    u = open("cho-1-elementary.pdf", "rb")
    x = scraperwiki.pdftoxml(u.read())
    soup = BeautifulSoup(x)
    book = soup.get_text().split('\n')
    page = []
    newpage = []
    for x in book[36:]:
        newpage += [x]
        if x == '':
            pass
        elif x[0] == 'p':
            page += [newpage]
            newpage = []
        elif x[0] == '<':
            newpage = [x]
    return page
def iter_areas():
    import scraperwiki
    import StringIO
    pdfurl = "http://www.appc.org.uk/appc/filemanager/root/site_assets/pdfs/appc_register_entry_for_1_december_2009_to_28_february_2010.pdf"
    pdf = scraperwiki.scrape(pdfurl)
    print "Converting pdf to xml"
    xml = scraperwiki.pdftoxml(pdf)
    print "got xml"
    xmlfd = StringIO.StringIO(xml)
    doc = PdfToHTMLOutputParser(xmlfd)
    print "got doc"
    #import sys
    #doc = PdfToHTMLOutputParser(open(sys.argv[1]))
    org = {}
    grouper = TextGrouper()
    grouper.add_patterns(
        (re.compile("APPC register entry ", re.IGNORECASE), "dates"),
        ("Address(es) in UK", "address"),
        ("Address in UK", "address"),
        ("Contact", "contact"),
        ("Offices outside UK", "section"),
        (re.compile("providing PA consultancy services", re.IGNORECASE), "section"),
        (re.compile("clients for whom", re.IGNORECASE), "section"),
    )

    def font_0(item):
        if item.fontspec.number == 0:
            item.props['type'] = 'name'
            item.props['grabbottom'] = 20
            print "Marked title:", repr(item.text)

    grouper.special_fns.append(font_0)
    grouper.group(doc.text(merge_verticals=True))
    #grouper.display()
    #grouper.display_full()
    for area in grouper.areas:
        yield area
def fetch_record(url):
    f = requests.get(url)
    pdf = scraperwiki.pdftoxml(f.content)
    root = lxml.etree.fromstring(pdf)
    texts = root.xpath("//text")
    rows = {}
    for text in texts:
        top = int(text.xpath("./@top")[0]) / 10 * 10
        left = int(text.xpath("./@left")[0])
        value = text.text.strip()
        if top not in rows:
            rows[top] = []
        rows[top].append(value)
    rows_sorted = [rows[key] for key in sorted(rows.keys())]
    first_row = rows_sorted[0][0]
    words = first_row.split(" ")
    month = list(calendar.month_name).index(words[-2])
    if month == 0:
        raise Exception("Cannot parse month")
    year = int(words[-1])
    rows_sorted = rows_sorted[1:-1]
    header = [convert(s) for s in rows_sorted[0]]
    num_rows = len(header)
    for i in range(1, len(rows_sorted)):
        row = rows_sorted[i]
        k = num_rows - len(row)
        if k > 0:
            padding = [""] * k
            rows_sorted[i] = padding + rows_sorted[i]
    for i in range(1, len(rows_sorted)):
        if len(rows_sorted[i][0]) == 0:
            rows_sorted[i][0] = rows_sorted[i - 1][0]
        d = {"year": year, "month": month}
        for j in range(0, len(header)):
            d[header[j]] = rows_sorted[i][j]
        scraperwiki.sqlite.save(unique_keys=['year', 'month', 'district'], data=d)
        print d
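# The trick fetch_record() relies on, pulled out as a standalone sketch: group
# pdftoxml <text> fragments into visual rows by rounding the top coordinate
# down to the nearest 10 pixels (Python 2 integer division), so fragments on
# the same printed line share a bucket even when their tops differ by a pixel or two.
def group_rows(texts):
    rows = {}
    for text in texts:
        top = int(text.get("top")) / 10 * 10  # bucket key: top rounded down to 10px
        rows.setdefault(top, []).append(text.xpath("string()").strip())
    # return the buckets in top-to-bottom page order
    return [rows[key] for key in sorted(rows.keys())]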
def get_id_period(self, date):
    from_iso_dt, to_iso_dt = util.inc_dt(date.strftime(util.ISO8601_DATE), util.ISO8601_DATE, self.PERIOD_TYPE)
    from_dt = util.get_dt(from_iso_dt, util.ISO8601_DATE)
    to_dt = util.get_dt(to_iso_dt, util.ISO8601_DATE)
    url_date = to_dt.strftime(self.search_url)
    if self.DEBUG: print url_date
    try:
        response = self.br.open(url_date)
    except:
        response = None
    final_result = []
    if response:
        pdfxml = scraperwiki.pdftoxml(response.read())
        if self.DEBUG: print pdfxml
        url = response.geturl()
        result = scrapemark.scrape(self.scrape_ids1, pdfxml, url)
        if not result or not result.get('records'):
            result = scrapemark.scrape(self.scrape_ids2, pdfxml, url)
        if result and result.get('records'):
            for rec in result['records']:
                rec['url'] = url_date
                rec['start_date'] = rec['date_received']
                try:
                    map_ref_list = rec['os_map_ref'].split()
                    rec['easting'] = map_ref_list[0]
                    rec['northing'] = map_ref_list[1]
                    del rec['os_map_ref']
                except:
                    pass
            self.clean_ids(result['records'])
            for rec in result['records']:  # note do this after record cleaning
                rec['date_scraped'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            final_result.extend(result['records'])
        #else:
        #    return [], None, None
    return final_result, from_dt, to_dt  # note weekly result might sometimes be legitimately empty
def carregaPagina(url):
    print url
    pdfdata = urllib2.urlopen(url).read()
    #print "The pdf file has %d bytes" % len(pdfdata)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    print "The first 5000 characters are: ", xmldata[:5000]
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)
    print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    arquivo = url.replace("https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/Arquivos/", "")
    cabecalho = True
    for page in pages:  #[:1]
        data = {}
        conta = 0
        coluna = 0
        for el in list(page)[:110]:  #[:100]
            if el.tag == "text":
                data[colunas[coluna]] = el.text.strip()
                coluna = coluna + 1
                if coluna >= len(colunas):
                    if not cabecalho:
                        data['arquivo'] = arquivo
                        #print el.attrib['left'], el.text
                        if conta < 10:
                            print data
                        scraperwiki.datastore.save(["arquivo", "nome", "cargo", "municipio"], data)
                        conta = conta + 1
                    data = {}
                    cabecalho = False
                    coluna = 0
        print "Pagina %s: %s registro(s)" % (page.attrib.get("number"), conta)
def main():
    url = "http://www.freedomhouse.org/sites/default/files/Freedom%20OnThe%20Net_Full%20Report.pdf"
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    goodpages = [27, 28, 29]
    rootdata = lxml.etree.fromstring(xmldata)
    pages = list(rootdata)
    page = pages[23]
    alltext = getText(page)
    dict1(alltext)
    for i in goodpages:
        page = pages[i]
        alltext = getText(page)
        dict2(alltext)
    pagenumbers = [30, 36, 47, 53, 57, 61, 65, 72, 78, 82, 87, 97, 102, 108]
    Country = ['Brazil', 'China', 'Cuba', 'Egypt', 'Estonia', 'Georgia', 'India', 'Iran', 'Kenya', 'Malaysia', 'Russia', 'Tunisia', 'Turkey', 'UK']
    for i in range(len(pagenumbers)):
        page = pages[pagenumbers[i]]
        alltext = getText(page)
        PageInfo(alltext, Country[i])
import scraperwiki
import urllib2
import lxml.etree
import bs4

url = "http://dget.nic.in/ItiUpgradePPP/list%20of%20%20ITIs%20only%20wth%20industry%20partners10-11.pdf"
pdfdata = urllib2.urlopen(url).read()
#print "The pdf file has %d bytes" % len(pdfdata)
xmldata = scraperwiki.pdftoxml(pdfdata)
#print "After converting to xml it has %d bytes" % len(xmldata)
#print "The first 2000 characters are: ", xmldata[:2000]
root = lxml.etree.fromstring(xmldata)
print xmldata
soup = bs4.BeautifulSoup(xmldata)
#print soup
start = False
ITI = True
sl_no = 0
for link in soup.find_all('text'):
    #print link.textcontent()
    #print link.get_text()
    #print str(start)
    text = link.get_text()
    text = text.replace(',', ' ')
    if start:
        if len(text) > 4 or text.count('NIL') > 0:
            #print text
            #if text.count('(ITI-')>0:
            #    continue
            pass
    geocode = simplejson.loads(geo_response.read())
    print geocode_url
    print geocode
    #Google imposes query limits, this lets us pass a failure and have the loop sleep and try again after 2 seconds
    if geocode['status'] == "OVER_QUERY_LIMIT":
        return 0
    if geocode['status'] != 'ZERO_RESULTS':
        coord_lat = geocode['results'][0]['geometry']['location']['lat']
        coord_lon = geocode['results'][0]['geometry']['location']['lng']
        coord.append(coord_lat)
        coord.append(coord_lon)
    print coord
    return coord

url = "https://www.denvergov.org/Portals/707/documents/mydenverdrive/1-22-25-2013.pdf"
xml = scraperwiki.pdftoxml(urllib2.urlopen(url).read())
parsed = BeautifulSoup(xml).text.split("\n")
filtered_list = parsed[parsed.index('Location: '):]
closures = []
i = 0
current_closure = -1
while i < len(filtered_list):
    text = filtered_list[i]
    if text == "Location: ":
        closures.append({})
        current_closure = len(closures) - 1
        i += 1
        closures[current_closure]['location'] = filtered_list[i]
        #print filtered_list[i]
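# The OVER_QUERY_LIMIT comment above implies a retry loop in the calling code,
# which is cut off in this snippet. A minimal sketch, assuming the truncated
# helper above is named geocode_address (its def line is missing here):
import time

def geocode_with_retry(address):
    while True:
        coord = geocode_address(address)  # hypothetical name for the truncated helper
        if coord != 0:  # the helper returns 0 when Google throttles the query
            return coord
        time.sleep(2)  # sleep and try again after 2 seconds, per the comment above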
# Blank Python
import sys
import scraperwiki
import urllib
import lxml.etree, lxml.html
import re
# for the geocode
from geopy import geocoders
import json

pdfurl = "http://www.nikebiz.com/responsibility/documents/factory_disclosure_list.pdf"
pdfdata = urllib.urlopen(pdfurl).read()
pdfxml = scraperwiki.pdftoxml(pdfdata)
root = lxml.etree.fromstring(pdfxml)
g = geocoders.Google('ABQIAAAAJWpc-texCflE7mMP0dgMGRTudD1_fegkcYIvU14JimqYoyT2khRxYTlCvIBPJApaoqvk4JfEfbrhyg')
for page in root:
    assert page.tag == 'page'
    #print "page details", page.attrib
    pagelines = {}
    pagedata = {}
    for v in page:
        if v.tag == 'text':
            text = re.match('(?s)<text.*?>(.*?)</text>',
                            lxml.etree.tostring(v)).group(1)
from scraperwiki import pdftoxml
from urllib2 import urlopen
from lxml.html import fromstring, tostring
import lxml.etree

def get_pdf_list():
    raw = urlopen('http://www.dropbox.com/sh/gpi0ejooop07x8a/bMDz4s9Ixp').read()
    html = fromstring(raw)
    a_elements = html.cssselect('li.browse-file.list-view-cols div.filename-col a')
    pdf_urls = [a.attrib['href'] + '?dl=1' for a in a_elements]
    return pdf_urls

test_url = 'https://www.dropbox.com/sh/gpi0ejooop07x8a/qJxWkjx8fz/ENDA-HR1858-June1997.pdf?dl=1'
raw_pdf = urlopen(test_url).read()
pdfxml = lxml.etree.fromstring(pdftoxml(raw_pdf))
rawtext = pdfxml.xpath('string()').replace('\n', ' ')
print rawtext
def get_root(filename):
    with open(filename, "rb") as f:
        return lxml.html.fromstring(scraperwiki.pdftoxml(f.read()))
import scraperwiki
from BeautifulSoup import BeautifulSoup
import time
import urllib

urltemplate = 'http://www.fco.gov.uk/resources/en/protocol/ldl-'
date = time.strftime('%A %d %B %Y')
month = (date.split(' '))[2]
year = (date.split(' '))[3]
#url = urltemplate + month + year
url = 'http://www.fco.gov.uk/resources/en/protocol/ldl-August2010'
pdfinput = urllib.urlopen(url)
print 'got pdf'
scraped = scraperwiki.pdftoxml(pdfinput.read())
print 'pdftohtml complete'
output = []

def getlastrow():
    l = len(output)
    return output[l - 1]  # the last row

print 'finished setup'
soup = BeautifulSoup(scraped)
print 'soup cooked'
# this document is a right dog