def runTest(self):
    """Convert ``test2.pdf`` to XML and compare it with the expected markup.

    Both the produced XML and the expected text are split on newlines,
    stripped, and filtered of blank lines before the comparison, so
    indentation and empty lines do not influence the result.
    """
    expected = """
        <pdf2xml>
        <page number="1" width="612" height="792">
        <text top="71" left="56" width="341" height="15">This is a unicode test, ŠE ČE ŽE, če že še, đon Đon. Bä ĕř. Öl. Löüãñ.</text>
        <text top="71" left="324" width="7" height="17">ȩ</text>
        </page>
        </pdf2xml>
    """
    pdf_fn = datafile("test2.pdf")

    # Kept as function-scope imports, as in the original test.
    from pypdf2xml import pdf2xml
    from StringIO import StringIO

    # Read the PDF inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(pdf_fn, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    xml = pdf2xml(StringIO(pdf_bytes))

    got_lines = [i.strip() for i in xml.split("\n") if i.strip()]
    expected_lines = [i.strip() for i in expected.split("\n") if i.strip()]
    self.assertEqual(got_lines, expected_lines)
def tracta_fitxer(fitxer, debug_alumne=None):
    """Convert the PDF at *fitxer* to XML and report its contents.

    The PDF is converted with ``pdf2xml``, newlines are removed from the
    resulting text, and it is parsed with ``etree.fromstring``. The tree is
    then split into per-student blocks with ``split_blocs_alumnes``.

    Args:
        fitxer: Path of the PDF file. The text before its first ``.`` is
            used as the first field of every printed line.
        debug_alumne: Optional substring. When truthy, nothing is printed;
            instead the first block whose ``alumne['nom']`` contains it is
            returned for inspection.

    Returns:
        ``(bloc_alumne, mps)`` for the first matching block when
        *debug_alumne* is given and matches; otherwise ``None``.
    """
    # Close the PDF handle as soon as the conversion has produced its text.
    with open(fitxer, 'rb') as pdf_file:
        s = pdf2xml(pdf_file)
    root = etree.fromstring(s.replace('\n', ''))

    for bloc_alumne in split_blocs_alumnes(root):
        alumne = dades_alumne(bloc_alumne)
        mps = split_blocs_mps(bloc_alumne)

        if debug_alumne:
            # Debug mode: hand back the matching block and never print.
            if debug_alumne in alumne['nom']:
                return bloc_alumne, mps
            continue

        for mp in mps:
            for uf in mp['ufs']:
                nota, hores = cuina_nota(uf['nota_raw'])
                r = u"{}|{}|{}|{}|{}|{}".format(
                    fitxer.split(".")[0],
                    alumne['nom'],
                    mp['nom'],
                    uf['uf'],
                    nota,
                    hores)
                # Parenthesised single-argument form prints the same text
                # under the Python 2 print statement and the print function.
                print(r)
def runTest(self):
    """Check the XML produced for ``test2.pdf`` against the expected text.

    The comparison is line based: both sides are split on newlines, each
    line is stripped, and empty lines are dropped before asserting equality.
    """
    expected = """
        <pdf2xml>
        <page number="1" width="612" height="792">
        <text top="71" left="56" width="341" height="15">This is a unicode test, ŠE ČE ŽE, če že še, đon Đon. Bä ĕř. Öl. Löüãñ.</text>
        <text top="71" left="324" width="7" height="17">ȩ</text>
        </page>
        </pdf2xml>
    """
    pdf_fn = datafile('test2.pdf')

    # Kept as function-scope imports, as in the original test.
    from pypdf2xml import pdf2xml
    from StringIO import StringIO

    # The context manager guarantees the PDF handle is closed once its
    # bytes have been read; the original left the handle open.
    with open(pdf_fn, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    xml = pdf2xml(StringIO(pdf_bytes))

    got_lines = [i.strip() for i in xml.split('\n') if i.strip()]
    expected_lines = [i.strip() for i in expected.split('\n') if i.strip()]
    self.assertEqual(got_lines, expected_lines)
def runTest(self):
    """Check the XML produced for ``test1.pdf`` against the expected text.

    The comparison is line based: both sides are split on newlines, each
    line is stripped, and empty lines are dropped before asserting equality.
    """
    expected = """
        <pdf2xml>
        <page number="1" width="612" height="792">
        <text top="71" left="56" width="87" height="15">This is a test PDF.</text>
        </page>
        </pdf2xml>
    """
    pdf_fn = datafile('test1.pdf')

    # Kept as function-scope imports, as in the original test.
    from pypdf2xml import pdf2xml
    from StringIO import StringIO

    # The context manager guarantees the PDF handle is closed once its
    # bytes have been read; the original left the handle open.
    with open(pdf_fn, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    xml = pdf2xml(StringIO(pdf_bytes))

    got_lines = [i.strip() for i in xml.split('\n') if i.strip()]
    expected_lines = [i.strip() for i in expected.split('\n') if i.strip()]
    self.assertEqual(got_lines, expected_lines)
def runTest(self):
    """Convert ``test1.pdf`` to XML and compare it with the expected markup.

    Both the produced XML and the expected text are split on newlines,
    stripped, and filtered of blank lines before the comparison, so
    indentation and empty lines do not influence the result.
    """
    expected = """
        <pdf2xml>
        <page number="1" width="612" height="792">
        <text top="71" left="56" width="87" height="15">This is a test PDF.</text>
        </page>
        </pdf2xml>
    """
    pdf_fn = datafile("test1.pdf")

    # Kept as function-scope imports, as in the original test.
    from pypdf2xml import pdf2xml
    from StringIO import StringIO

    # Read the PDF inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(pdf_fn, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    xml = pdf2xml(StringIO(pdf_bytes))

    got_lines = [i.strip() for i in xml.split("\n") if i.strip()]
    expected_lines = [i.strip() for i in expected.split("\n") if i.strip()]
    self.assertEqual(got_lines, expected_lines)
# Character replacement table; it is not referenced by the code visible
# in this part of the file.
# NOTE(review): the first key and the last two keys may originally have been
# special glyphs (e.g. ligatures) that were lost in transit -- verify
# against the original file.
translate = {"ff":"ff","“":'"',"’’":'"',"":"","":""}

# NOTE(review): `sys` is used below but no import of it was visible in this
# part of the file; a repeated import is harmless if it already exists.
import sys

# NOTE(review): the collapsed source read "# import yaml"; the import is
# enabled here because attribuer() calls yaml.dump -- confirm.
import yaml
from pypdf2xml import pdf2xml
from bs4 import BeautifulSoup as bs


def attribuer(string, noeud, attributs):
    """Wrap *string* in a tag named *noeud* carrying *attributs*.

    Args:
        string: The text placed between the opening and closing tags.
        noeud: The name of the node that will contain *string*.
        attributs: A dictionary. It is serialised with ``yaml.dump``; the
            surrounding braces/newlines are stripped, ``:`` is replaced by
            ``=`` and commas are removed to form the attribute text.

    Returns:
        The formatted ``<noeud attrs>string</noeud>`` text.
    """
    nouage = "<{0} {1}>{2}</{0}>"
    attrs = yaml.dump(attributs).strip("\n{}").replace(":","=").replace(",","")
    return nouage.format(noeud,attrs,string)


fichier_in = sys.argv[1]
# `sortie` and `sep` are only needed by the disabled file-writing code below.
(sortie,sep,extension) = fichier_in.rpartition('.')

# A PDF is binary data: open it in "rb" (the original used text mode "r")
# and close the handle as soon as the conversion is done.
with open(fichier_in,"rb") as pdf_file:
    fichier_xml = pdf2xml(pdf_file).splitlines()

for ligne in fichier_xml:
    print(ligne)

# Disabled alternatives kept from the original script:
#soup = bs(fichier_xml,"xml")
#print(soup.prettify())
#with codecs.open(sortie+sep+"xml","w") as ecriture:
    #for ligne in fichier_xml:
        #ecriture.write(ligne+"\n")
import sys import csv from pypdf2xml import pdf2xml from bs4 import BeautifulSoup if len(sys.argv) != 2: print """usage: to see the output: \tpdf2xml file.pdf to write output to file: \tpdf2xml file.pdf > outfile.xml """ else: docinfo = pdf2xml(open(sys.argv[1],'rb')) soup = BeautifulSoup(docinfo, "lxml") arr = [] for page in soup.find_all(width="612", height="792"): dic = {} name = page.find(top="182", left="72") dic["Name"] = name.get_text() amount = page.find(top="74", left="394") dic["Amount"] = amount.get_text() deg = page.find(top="182", left="394") dic["Degrees"] = deg.get_text() address1 = page.find(top="219", left="176") if address1: address_1 = address1.get_text()