Exemplo n.º 1
0
def fixture(filename):
    """
    Return the PDFDocument built from the sample file ``filename``.

    The file is looked up under ``../fixtures/sample_data`` relative to the
    directory of this module. The first request for a given name opens the
    file, wraps it in a PDFDocument and stores that object in the
    module-level ``memoized`` mapping; every later request for the same name
    returns the stored object.
    """
    if filename not in memoized:
        module_dir = abspath(dirname(__file__))
        sample_path = pjoin(module_dir, "..", "fixtures", "sample_data", filename)
        # The handle is not closed here: it is handed to the PDFDocument
        # that is kept in the cache.
        handle = open(sample_path, "rb")
        memoized[filename] = PDFDocument(handle)
    return memoized[filename]
Exemplo n.º 2
0
def render_pdf(pdf_filename):
    """
    Render every page of a PDF, with its table annotations, page by page.

    For each page of the document, in order, tables are detected with
    ``page_to_tables``, turned into annotations with ``make_annotations`` and
    passed to ``render_page`` together with the output paths
    ``svgs/<name>_<NN>.svg`` and ``pngs/<name>_<NN>.png``. ``<name>`` is the
    base name of ``pdf_filename`` and ``<NN>`` is the zero-based page index
    padded to two digits. One "Rendered ..." line is printed per page.

    :param pdf_filename: path of the PDF file to render.
    """
    with open(pdf_filename, "rb") as fd:
        doc = PDFDocument.from_fileobj(fd)

        # The base name does not change between pages, so compute it once.
        name = basename(pdf_filename)

        for page_number, page in enumerate(doc.get_pages()):
            svg_file = "svgs/{0}_{1:02d}.svg".format(name, page_number)
            png_file = "pngs/{0}_{1:02d}.png".format(name, page_number)

            table_container = page_to_tables(page)
            annotations = make_annotations(table_container)

            render_page(pdf_filename, page_number, annotations, svg_file, png_file)

            # A single-argument print(...) produces the same output under
            # the Python 2 print statement and the Python 3 print function.
            print("Rendered {0} {1}".format(svg_file, png_file))
Exemplo n.º 3
0
def render_pdf(pdf_filename):
    """
    Render each page of a PDF, together with its table annotations.

    Pages are processed in document order. For every page the tables found
    by ``page_to_tables`` are converted with ``make_annotations`` and handed
    to ``render_page`` along with two output paths:
    ``svgs/<name>_<NN>.svg`` and ``pngs/<name>_<NN>.png``, where ``<name>``
    is the base name of ``pdf_filename`` and ``<NN>`` is the zero-based page
    index padded to two digits. A "Rendered ..." line is printed per page.

    :param pdf_filename: path of the PDF file to render.
    """
    with open(pdf_filename, "rb") as fd:

        doc = PDFDocument.from_fileobj(fd)

        # Loop-invariant: the base name is identical for every page.
        name = basename(pdf_filename)

        for page_number, page in enumerate(doc.get_pages()):
            svg_file = 'svgs/{0}_{1:02d}.svg'.format(name, page_number)
            png_file = 'pngs/{0}_{1:02d}.png'.format(name, page_number)

            table_container = page_to_tables(page)
            annotations = make_annotations(table_container)

            render_page(pdf_filename, page_number, annotations, svg_file,
                        png_file)

            # Parenthesised single-argument form: identical output as a
            # Python 2 print statement and as a Python 3 print() call.
            print("Rendered {0} {1}".format(svg_file, png_file))
Exemplo n.º 4
0
def check(path):
    """
    Print the tables detected on page index 0 of the PDF at ``path``.

    :param path: path of the PDF file to inspect.
    """
    # All processing stays inside the ``with`` block so the file is open
    # while the document is read and is closed afterwards, even on error.
    with open(path, "rb") as fileobj:
        doc = PDFDocument.from_fileobj(fileobj)
        tables = pdftables.page_to_tables(doc.get_page(0))
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and also works as the Python 3 print function.
        print(tables)
Exemplo n.º 5
0
#pagenumber = 1

#SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf"
#pagenumber = 1

#SelectedPDF = "commodity-prices_en.pdf"
#pagenumber = 1

#SelectedPDF = "AnimalExampleTables.pdf" # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
#pagenumber = 2

# Build the path of the selected test PDF inside the PDF_TEST_FILES directory.
# NOTE(review): SelectedPDF and pagenumber are only assigned in commented-out
# lines in the visible part of this script — confirm they are set earlier.
filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
# NOTE(review): fh is not assigned anywhere in the visible code — confirm it
# is defined before this call.
pta.plotAllPages(fh)


# Open the PDF and fetch the page to analyse.
# NOTE(review): the file object handed to PDFDocument is never explicitly
# closed in the visible code.
doc = PDFDocument(open(filepath, 'rb'))
pdf_page = doc.get_page(pagenumber)

# Run table detection on the page with explicit configuration; the result is
# unpacked into the table and its diagnostic data.
# NOTE(review): table_top_hint and table_bottom_hint are not assigned in the
# visible code — confirm they are defined earlier.
table, diagnosticData = page_to_tables(
    pdf_page, ConfigParameters(
        extend_y=False,
        table_top_hint=table_top_hint,
        table_bottom_hint=table_bottom_hint,
        atomise=False))

# Plot the diagnostic data; the call returns a pair that is unpacked into
# fig and ax1.
fig, ax1 = pta.plotpage(diagnosticData)

# Start a text report in an in-memory buffer with the table dimensions.
result = StringIO()
(columns, rows) = get_dimensions(table)
result.write("     {} columns, {} rows\n".format(columns, rows))
Exemplo n.º 6
0
def check(path):
    """
    Print the tables detected on page index 0 of the PDF at ``path``.

    :param path: path of the PDF file to inspect.
    """
    # Use a context manager so the file handle is released once the tables
    # have been printed, including when an exception is raised.
    with open(path, 'rb') as fileobj:
        doc = PDFDocument.from_fileobj(fileobj)
        tables = pdftables.page_to_tables(doc.get_page(0))
        # Parenthesised single-argument print works on Python 2 and Python 3
        # with identical output.
        print(tables)
Exemplo n.º 7
0
from pdftables.pdf_document import PDFDocument as pdfdoc
from pdftables.pdftables import page_to_tables
from pdftables.display import to_string

# Path of the PDF to read.
filepath = 'irregular-verbs-de.pdf'

# Keep every step that uses the document inside the ``with`` block so the
# file is open while it is needed and is closed afterwards.
with open(filepath, 'rb') as fileobj:
    doc = pdfdoc.from_fileobj(fileobj)

    # Detect the tables on page index 0 and print each one as text.
    page = doc.get_page(0)
    tables = page_to_tables(page)
    for table in tables:
        # Single-argument print(...) gives the same output as the Python 2
        # print statement and is also valid on Python 3.
        print(to_string(table.data))
Exemplo n.º 8
0
import pdftables

from pdftables.display import to_string
from pdftables.pdf_document import PDFDocument
from pdftables.pdftables import page_to_tables

# Path of the PDF to read.
filepath = 'CBSinglePage.pdf'

# Open the file in a ``with`` block so it stays open while the document is
# processed and is closed once the tables have been printed.
with open(filepath, 'rb') as fileobj:
    # Create a PDF document from the file object.
    doc = PDFDocument.from_fileobj(fileobj)

    # Use the get_page() method to select a single page from the document.
    # NOTE(review): page index 12 is requested although the file name
    # suggests a single-page PDF — confirm the intended index.
    page = doc.get_page(12)
    tables = page_to_tables(page)

    # Now that you have a TableContainer object, convert each table to ASCII
    # for quick previewing. The single-argument print(...) form gives the
    # same output on Python 2 and Python 3.
    for table in tables:
        print(to_string(table.data))
Exemplo n.º 9
0
#fh = pdftables.filehandleFromURL("http://www.candyusa.com/files/1st%20qtr%202013%20report.pdf")
#pagenumber = 1

#SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf"
#pagenumber = 1

#SelectedPDF = "commodity-prices_en.pdf"
#pagenumber = 1

#SelectedPDF = "AnimalExampleTables.pdf" # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
#pagenumber = 2

# Build the path of the selected test PDF inside the PDF_TEST_FILES directory.
# NOTE(review): SelectedPDF and pagenumber are only assigned in commented-out
# lines in the visible part of this script — confirm they are set earlier.
filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
# NOTE(review): the only visible assignment to fh is commented out — confirm
# it is defined before this call.
pta.plotAllPages(fh)

# Open the PDF and fetch the page to analyse.
# NOTE(review): the file object handed to PDFDocument is never explicitly
# closed in the visible code.
doc = PDFDocument(open(filepath, 'rb'))
pdf_page = doc.get_page(pagenumber)

# Run table detection on the page with explicit configuration; the result is
# unpacked into the table and its diagnostic data.
# NOTE(review): table_top_hint and table_bottom_hint are not assigned in the
# visible code — confirm they are defined earlier.
table, diagnosticData = page_to_tables(
    pdf_page,
    ConfigParameters(extend_y=False,
                     table_top_hint=table_top_hint,
                     table_bottom_hint=table_bottom_hint,
                     atomise=False))

# Plot the diagnostic data; the call returns a pair that is unpacked into
# fig and ax1.
fig, ax1 = pta.plotpage(diagnosticData)

# Start a text report in an in-memory buffer with the table dimensions.
result = StringIO()
(columns, rows) = get_dimensions(table)
result.write("     {} columns, {} rows\n".format(columns, rows))
Exemplo n.º 10
0
from pdftables.pdf_document import PDFDocument as pdfdoc
from pdftables.pdftables import page_to_tables
from pdftables.display import to_string


# Path of the PDF to read.
filepath = 'irregular-verbs-de.pdf'

# The document is built from the open file object, so everything that uses
# it runs inside the ``with`` block; the file is closed when the block ends.
with open(filepath, 'rb') as fileobj:
    doc = pdfdoc.from_fileobj(fileobj)

    # Detect the tables on page index 0 and print each one as text.
    page = doc.get_page(0)
    tables = page_to_tables(page)
    for table in tables:
        # Parenthesised single-argument print: same output under Python 2,
        # and valid syntax under Python 3.
        print(to_string(table.data))