Пример #1
29
    def runTest(self):
        """Convert ``test2.pdf`` with ``pdf2xml`` and compare with the expected XML.

        Both the produced and the expected documents are reduced to their
        stripped, non-blank lines before comparison, so indentation and empty
        lines do not influence the result.
        """
        expected = """
        <pdf2xml>
            <page number="1" width="612" height="792">
                <text top="71" left="56" width="341" height="15">This is a unicode test, ŠE ČE ŽE, če že še, đon Đon. Bä ĕř. Öl. Löüãñ.</text>
                <text top="71" left="324" width="7" height="17">ȩ</text>
            </page>
        </pdf2xml>
        """

        pdf_fn = datafile("test2.pdf")
        from pypdf2xml import pdf2xml
        from StringIO import StringIO

        # Read the PDF inside a ``with`` block so the handle is closed right
        # away instead of being left open until garbage collection.
        with open(pdf_fn, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        xml = pdf2xml(StringIO(pdf_bytes))

        got_lines = [line.strip() for line in xml.split("\n") if line.strip()]
        expected_lines = [
            line.strip() for line in expected.split("\n") if line.strip()
        ]
        self.assertEqual(got_lines, expected_lines)
Пример #2
0
def tracta_fitxer(fitxer, debug_alumne=None):
    """Convert the PDF ``fitxer`` to XML and print one line per student/mp/uf.

    The file is converted with ``pdf2xml``, newlines are removed and the
    result is parsed with ``etree.fromstring``.  Every block yielded by
    ``split_blocs_alumnes`` is then processed.

    Args:
        fitxer: Path of the PDF file to open.
        debug_alumne: Optional text searched for in ``alumne['nom']``.  When
            truthy nothing is printed; the first block whose name contains it
            is returned together with its ``mps``.

    Returns:
        ``(bloc_alumne, mps)`` for the first match in debug mode, otherwise
        ``None`` (also when debug mode finds no match).
    """
    # Close the PDF as soon as it has been converted instead of leaking the
    # open handle.
    with open(fitxer, 'rb') as pdf_file:
        s = pdf2xml(pdf_file)
    root = etree.fromstring(s.replace('\n', ''))

    # NOTE: only the text before the first "." of the path is used as label.
    etiqueta = fitxer.split(".")[0]

    for bloc_alumne in split_blocs_alumnes(root):
        alumne = dades_alumne(bloc_alumne)
        mps = split_blocs_mps(bloc_alumne)
        if debug_alumne:
            # Debug mode: hand back the first matching block, print nothing.
            if debug_alumne in alumne['nom']:
                return bloc_alumne, mps
            continue
        for mp in mps:
            for uf in mp['ufs']:
                nota, hores = cuina_nota(uf['nota_raw'])
                r = u"{}|{}|{}|{}|{}|{}".format(
                    etiqueta, alumne['nom'], mp['nom'],
                    uf['uf'], nota, hores)
                # Single-argument call form prints the same on Python 2 and 3.
                print(r)
Пример #3
0
    def runTest(self):
        """Check the ``pdf2xml`` output for ``test2.pdf`` against the expected XML.

        The comparison is made on stripped, non-blank lines only, so the
        indentation of the expected literal below is irrelevant.
        """
        expected = """
        <pdf2xml>
            <page number="1" width="612" height="792">
                <text top="71" left="56" width="341" height="15">This is a unicode test, ŠE ČE ŽE, če že še, đon Đon. Bä ĕř. Öl. Löüãñ.</text>
                <text top="71" left="324" width="7" height="17">ȩ</text>
            </page>
        </pdf2xml>
        """

        pdf_fn = datafile('test2.pdf')
        from pypdf2xml import pdf2xml
        from StringIO import StringIO

        # Use a context manager so the PDF file is closed once it is read.
        with open(pdf_fn, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()
        xml = pdf2xml(StringIO(pdf_bytes))

        got_lines = [line.strip() for line in xml.split('\n') if line.strip()]
        expected_lines = [
            line.strip() for line in expected.split('\n') if line.strip()
        ]
        self.assertEqual(got_lines, expected_lines)
Пример #4
0
    def runTest(self):
        """Check the ``pdf2xml`` output for ``test1.pdf`` against the expected XML.

        Only stripped, non-blank lines of both documents are compared.
        """
        expected = """
        <pdf2xml>
            <page number="1" width="612" height="792">
                <text top="71" left="56" width="87" height="15">This is a test PDF.</text>
            </page>
        </pdf2xml>
        """

        pdf_fn = datafile('test1.pdf')
        from pypdf2xml import pdf2xml
        from StringIO import StringIO

        # Use a context manager so the PDF file is closed once it is read.
        with open(pdf_fn, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()
        xml = pdf2xml(StringIO(pdf_bytes))

        got_lines = [line.strip() for line in xml.split('\n') if line.strip()]
        expected_lines = [
            line.strip() for line in expected.split('\n') if line.strip()
        ]
        self.assertEqual(got_lines, expected_lines)
Пример #5
0
    def runTest(self):
        """Convert ``test1.pdf`` with ``pdf2xml`` and compare with the expected XML.

        Both documents are reduced to their stripped, non-blank lines before
        being compared with ``assertEqual``.
        """
        expected = """
        <pdf2xml>
            <page number="1" width="612" height="792">
                <text top="71" left="56" width="87" height="15">This is a test PDF.</text>
            </page>
        </pdf2xml>
        """

        pdf_fn = datafile("test1.pdf")
        from pypdf2xml import pdf2xml
        from StringIO import StringIO

        # Read the PDF inside a ``with`` block so the handle is closed right
        # away instead of being left open until garbage collection.
        with open(pdf_fn, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        xml = pdf2xml(StringIO(pdf_bytes))

        got_lines = [line.strip() for line in xml.split("\n") if line.strip()]
        expected_lines = [
            line.strip() for line in expected.split("\n") if line.strip()
        ]
        self.assertEqual(got_lines, expected_lines)
Пример #6
0
# Replacement table; presumably maps a character sequence found in the
# extracted text to the text that should replace it.  Nothing in the visible
# script reads this table.
# NOTE(review): some keys appear empty and duplicated (the "" entries), which
# suggests special characters may have been lost -- confirm against the
# original file before relying on this table.
translate = {"ff":"ff","“":'"',"’’":'"',"":"","":""}

#
import yaml
from pypdf2xml import pdf2xml
from bs4 import BeautifulSoup as bs

def attribuer(string, noeud, attributs):
    """Wrap ``string`` in a ``noeud`` element that carries ``attributs``.

    Args:
        string: The content placed between the opening and closing tags.
        noeud: Name of the node (tag) that will contain ``string``.
        attributs: Dictionary of attributes.  It is serialized with
            ``yaml.dump`` and the dump is reshaped into the text written
            after the tag name.

    Returns:
        The text ``<noeud attrs>string</noeud>``.
    """
    brut = yaml.dump(attributs)
    # Trim newlines and braces from both ends of the YAML dump, then turn
    # every ":" into "=" and drop the commas.
    sans_bords = brut.strip("\n{}")
    attrs = sans_bords.replace(":", "=").replace(",", "")
    return "<{0} {1}>{2}</{0}>".format(noeud, attrs, string)

import sys  # needed for sys.argv below; this script has no other import of it

# The PDF path is the first command-line argument.
fichier_in = sys.argv[1]
(sortie, sep, extension) = fichier_in.rpartition('.')

# A PDF is binary data: open it in "rb" mode (text mode can corrupt or fail
# to decode the bytes) and close the handle once the conversion is done.
with open(fichier_in, "rb") as fichier_pdf:
    fichier_xml = pdf2xml(fichier_pdf).splitlines()

# Echo the converted XML line by line.
for ligne in fichier_xml:
    print(ligne)
#soup = bs(fichier_xml,"xml")
#print(soup.prettify())
#with codecs.open(sortie+sep+"xml","w") as ecriture:
	#for ligne in fichier_xml:
		#ecriture.write(ligne+"\n")
Пример #7
0
import sys
import csv
from pypdf2xml import pdf2xml
from bs4 import BeautifulSoup


# Exactly one argument (the PDF path) is required.  Stop after printing the
# usage text: carrying on would reach BeautifulSoup(docinfo, ...) with
# ``docinfo`` undefined and fail with a NameError.
if len(sys.argv) != 2:
    print("""usage:
to see the output:
\tpdf2xml file.pdf
to write output to file:
\tpdf2xml file.pdf > outfile.xml
""")
    sys.exit(1)

# Convert the PDF and close the file handle once the conversion is done.
with open(sys.argv[1], 'rb') as pdf_file:
    docinfo = pdf2xml(pdf_file)

soup = BeautifulSoup(docinfo, "lxml")

arr = []

for page in soup.find_all(width="612", height="792"):
	dic = {}
	name = page.find(top="182", left="72")
	dic["Name"] = name.get_text()
	amount = page.find(top="74", left="394")
	dic["Amount"] = amount.get_text()
	deg = page.find(top="182", left="394")
	dic["Degrees"] = deg.get_text()
	address1 = page.find(top="219", left="176")	
	if address1:
		address_1 = address1.get_text()