Python process_page示例，pdftableextract.process_page Python示例

示例#1

0

显示文件

	def tables(self):
		"""Yield each table found in this resource as a list of rows.

		For a PDF resource the document is hashed with MD5 and compared with
		the digest stored in the metadata graph. When the digest is missing
		or different, the per-page CSV files next to ``self.local`` are
		deleted and regenerated with ``pdfinfo`` and ``pte``, and the new
		digest is stored. Every matching CSV file is then read and yielded.

		For any other resource, every ``<table>`` under ``self.root`` (or
		under the elements matched by the ``css`` selector in ``self.env``)
		is yielded as ``Table(tb).matrix(txt)``.
		"""
		with env(save=True):
			if self.is_pdf:
				d = hashlib.md5()
				d.update(self.open().read())
				h = d.hexdigest()

				# Derive the CSV names from the path without its extension.
				# The previous re.sub(r".pdf", ...) matched "<any char>pdf"
				# anywhere in the path, and when nothing matched the glob
				# below resolved to the source file itself, which was then
				# deleted.
				base = os.path.splitext(self.local)[0]
				csv_pattern = glob.escape(base) + "_*.csv"

				with meta(env.get("meta")) as g:
					h2 = g.value(rdflib.URIRef(self.url), NS1["md5"])
					if not h2 or h != h2.value:
						# Stale output: drop it before regenerating.
						for f in glob.glob(csv_pattern):
							os.remove(f)

						# sys.stdout.encoding is None when stdout is not a
						# text terminal/stream, so fall back to UTF-8.
						info = subprocess.check_output(("pdfinfo", self.local)).decode(
							sys.stdout.encoding or "utf-8")
						pages = None
						for l in info.splitlines():
							m = re.match(r"^Pages:\s*(\d+)", l)
							if m:
								pages = int(m.group(1))
						assert pages, "pdfinfo reported no page count for %s" % self.local
						for i in range(pages):
							# pte page numbers are 1-based; file names keep
							# the 0-based index.
							p = i+1
							try:
								cells = pte.process_page(self.local, str(p), whitespace="raw")
							except Exception:
								print(self.local, p, i)
								raise

							out = "%s_%d.csv" % (base, i)
							try:
								pte.output(cells, i, table_csv_filename=out)
							except Exception:
								logging.error(self.local, exc_info=True)
								raise

						g.set((rdflib.URIRef(self.url), NS1["md5"], rdflib.Literal(h)))

					for f in glob.glob(csv_pattern):
						# Close each CSV before handing its rows to the caller.
						with open(f, newline="") as fh:
							rows = list(csv.reader(fh))
						yield rows
			else:
				def txt(obj):
					if obj is None:
						return ""
					return obj.text_content()

				root = self.root
				if root is None:
					# Nothing parsed: end the generator without yielding.
					return

				css = self.env.get("css", None)
				# Without a selector the whole document is the only container.
				containers = [root] if css is None else root.cssselect(css)
				for container in containers:
					for tb in container.cssselect("table"):
						yield Table(tb).matrix(txt)

示例#2

0

显示文件

#coding: utf-8

import pandas as pd
import pdftableextract as pdf

# Page numbers are handed to process_page as strings ("84" was the other
# page tried during testing).
pages = ["95"]

# Source document for this run. Earlier experiments pointed at example.pdf,
# ./2015_601628.pdf, ./2015_601766.pdf, table.pdf, table_up.pdf,
# table_down.pdf and table_down_1.pdf instead.
cells = [
    pdf.process_page("./2015_002594.pdf", p)
    for p in pages
]

# cells is a list with one entry per requested page; each entry is a list of
# 6-tuples whose first two fields are the column and row and whose last field
# is the cell content.
# NOTE(review): the middle three fields are presumably colspan, rowspan and
# page number -- confirm against the pdftableextract sources.

"""
print "cells:"
for cell in cells:	#cells: list of list
	print "len(cells) == 1? ", len(cells)
	for item in cell:	#cell: list of tuple
		tempStr = ""
		for element in item: #item: tuple
			tempStr += str(element) + ", "
		print tempStr

示例#3

0

显示文件

文件： shinseido_pdftocsv.py 项目： hkwi/our-data

import csv
import pdftableextract as pte
import csv
import os
import os.path
import glob

# Convert the listed page range of every downloaded PDF into one CSV per page.
for f in glob.glob("www.city.kobe.lg.jp/child/grow/shinseido/img/*.pdf"):
	fn = os.path.basename(f)
	# Skip PDFs whose marker file is at least as new as the PDF itself.
	# NOTE(review): nothing in this loop writes the .ttl file -- presumably a
	# later pipeline step produces it; confirm.
	out = "shinseido/%s.ttl" % fn
	if os.path.exists(out) and os.stat(out).st_mtime >= os.stat(f).st_mtime:
		continue

	# Look up this PDF's row in the index; the handle is closed right away
	# instead of being left to the garbage collector.
	with open("shinseido_meta/index.csv", encoding="UTF-8", newline="") as index:
		info = [r for r in csv.DictReader(index) if r["file"] == fn]
	assert info, fn

	pages = [int(info[0][k]) for k in ("start","end")]
	assert pages, fn

	# NOTE(review): range(*pages) stops before the "end" page -- confirm that
	# the index stores an exclusive end.
	for page in range(*pages):
		a = pte.process_page(f, str(page))
		x = pte.table_to_list(a, page)
		# newline="" lets the csv module control line endings itself.
		with open("shinseido/%s.p%02d.csv" % (fn, page), "w", newline="") as o:
			w = csv.writer(o)
			w.writerows(x[-1])

示例#4

0

显示文件

文件： test_to_pandas.py 项目： JoshBradshaw/pdf-table-extract

import pandas as pd
import pdftableextract as pdf

pages = ["1"]

# Run process_page once per page and gather every cell into one flat list.
cells = [
    cell
    for p in pages
    for cell in pdf.process_page("example.pdf", p)
]

# With default options process_page also reports a blank table at the top of
# the page, so the table of interest is the one at index 1.
li = pdf.table_to_list(cells, pages)[1]

# li is a list of rows (for this table only):
#   li[0]    header line
#   li[1]    column headings
#   li[2:-1] data rows, whose first column holds the store names
#   li[-1]   footer line
data = pd.DataFrame(
    li[2:-1],
    columns=li[1],
    index=[row[0] for row in li[2:-1]],
)

示例#5

0

显示文件

文件： test_to_pandas.py 项目： frankcode101/PDFProcessing

import pandas as pd
import pdftableextract as pdf

pages = ["1"]
cells = [pdf.process_page("example.pdf", p) for p in pages]

# Flatten the per-page lists into one list of cells.
cells = [item for sublist in cells for item in sublist]

# Without any options, process_page picks up a blank table at the top of the
# page, so choose table 1.
li = pdf.table_to_list(cells, pages)[1]

# li is a list of lists; the first line is the header and the last is the
# footer (for this table only!).
# Column 0 contains store names.
# Row 1 contains column headings.
# Data is row 2 through -1.

data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
# A single parenthesized argument prints identically under Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(data)

示例#6

0

显示文件

文件： domestic_tables_extract_multi.py 项目： akmiller01/alexm-util

# Column layout of every frame returned by search().
_BUDGET_COLUMNS = [
    "Region", "District", "Sector/MDA/MMDA",
    "Central GOG and CF: Comp of Emp", "Central GOG and CF: Goods/Service",
    "Central GOG and CF: Assets (Capital)", "Central GOG and CF: Total",
    "IGF: Comp of Emp", "IGF: Goods/Service", "IGF: Assets (Capital)",
    "IGF: Total",
    "Funds/Others: Comp of Emp", "Funds/Others: Goods/Service",
    "Funds/Others: Assets (Capital)", "Funds/Others: Total",
    "Donor: Comp of Emp", "Donor: Goods/Service", "Donor: Assets (Capital)",
    "Donor: Total",
    "Grand Total Less NREG / Statutory",
]


def _error_frame(regionName, inputName):
    """Return the one-row frame that marks a PDF as unparseable."""
    row = [regionName, inputName] + ["ERR"] * (len(_BUDGET_COLUMNS) - 2)
    return pd.DataFrame([row], columns=_BUDGET_COLUMNS)


def _parse_budget_cell(cell, regionName, inputName):
    """Split one raw table cell into output rows prefixed with region and file name."""
    # Runs of two or more non-digit characters are treated as row labels.
    labels = re.findall("[^0-9]{2,}", cell)
    candidates = []
    if len(labels) > 1:
        # Several labels share the cell: the values of each label are the
        # space-separated tokens between it and the next label.
        for i, label in enumerate(labels):
            start = cell.index(label) + len(label)
            end = cell.index(labels[i + 1]) if i + 1 < len(labels) else len(cell)
            candidates.append([label.strip()] + cell[start:end].split(" "))
    elif len(labels) == 1:
        numbers = re.findall("[0-9.,]+", cell[len(labels[0]):])
        candidates.append([labels[0].strip()] + numbers)
    else:
        candidates.append([""] + re.findall("[0-9.,]+", cell))
    # A label with no values at all is dropped.
    return [[regionName, inputName] + c for c in candidates if len(c) > 1]


def search(args):
    """Extract the budget tables from one PDF into a DataFrame.

    ``args`` is a single ``(inputFile, search)`` tuple, the same calling
    convention as the original Python 2 tuple-parameter signature:
    ``inputFile`` is the PDF path and ``search`` the text that marks the
    pages of interest. The first two pages containing ``search`` are passed
    to ``pdfextract``; the last two tables found are parsed row by row.

    Returns a DataFrame with the ``_BUDGET_COLUMNS`` layout. When fewer than
    two tables are found, or anything fails after the file is opened, a
    single row filled with ``"ERR"`` is returned instead.
    """
    inputFile, search = args
    baseName = os.path.basename(inputFile)
    inputName, inputExtension = os.path.splitext(baseName)
    print(inputName)
    # The parent directory name is used as the region.
    regionName = inputFile.split("/")[-2]
    fr = open(inputFile, 'rb')
    try:
        try:
            pdf = pyPdf.PdfFileReader(fr)
            hits = []
            for page in range(pdf.getNumPages()):
                spin()
                text = pdf.getPage(page).extractText()
                # Keep at most the first two matching pages (1-based strings).
                if search in text and len(hits) < 2:
                    hits.append(str(page + 1))
        finally:
            # Release the handle even when a page fails to parse.
            fr.close()

        cells = [pdfextract.process_page(inputFile, p) for p in hits]
        cells = [item for sublist in cells for item in sublist]
        li = pdfextract.table_to_list(cells, hits)
        if len(li) <= 1:
            return _error_frame(regionName, inputName)

        data = []
        # The first and last row of each table are skipped.
        for row in li[-2][1:-1]:
            spin()
            data.extend(_parse_budget_cell(row[0], regionName, inputName))
        for row in li[-1][1:-1]:
            spin()
            # Rows mentioning "Page" in the second table are not data.
            if row[0].find("Page") == -1:
                data.extend(_parse_budget_cell(row[0], regionName, inputName))
        return pd.DataFrame(data, columns=_BUDGET_COLUMNS)
    except Exception:
        # Best effort: one bad PDF must not abort a whole batch, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return _error_frame(regionName, inputName)

示例#7

0

显示文件

文件： medicament.py 项目： alej0varas/bioequivalencia

def transform(filename, console=None):
    """Extract every table row of the PDF ``filename`` into one flat list.

    Each page is run through ``pdftable``; blank rows and rows recognised by
    ``is_header_text`` are dropped, cell text is decoded from UTF-8 and
    stripped, and rows whose column 3 enumerates several products
    ("1. ... 2. ...") are expanded into one row per product. Progress
    messages are sent through ``write(console, ...)``.

    Returns the concatenated rows of all tables.
    """
    write(console, "  - Parsing...", ending='')

    # Only the page count is needed from the reader, so the handle is closed
    # as soon as the page list is built (it used to be left open).
    with open(filename, 'rb') as pdf_file:
        tablePdf = pdf.PdfFileReader(pdf_file)
        pages = [str(p) for p in range(1, len(tablePdf.pages) + 1)]

    # Process the cells and flatten the per-page cell structure
    cells = [pdftable.process_page(filename, p) for p in pages]
    cells = [item for sublist in cells for item in sublist]

    table = []

    write(console, "done")

    for page_table in pdftable.table_to_list(cells, pages):

        row_msg_format = "\r\033[K  - %d/%d records transformed"

        # The table is edited in place while it is walked, so the row count
        # and the cursor are tracked by hand.
        page_table_rows = len(page_table)
        idx = 0

        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')

            cell_len = sum([len(i) for i in page_table[idx]])

            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # Drop the row; idx now already points at the next one.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                for subidx in xrange(len(page_table[idx])):
                    page_table[idx][subidx] = unicode(page_table[idx][subidx], encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    re_split_cases = r'\W?\d+\.\W\W'
                    splitted_products = re.split(re_split_cases, page_table[idx][3])[1:]
                    splitted_vendors = re.split(re_split_cases, page_table[idx][4])[1:]
                    added_products = len(splitted_products)

                    # A vendor cell without numbering applies to every product.
                    if not len(splitted_vendors):
                        splitted_vendors = [page_table[idx][4]] * added_products

                    for i in xrange(0, added_products):
                        if i > 0:
                            # Every product after the first gets its own
                            # copy of the row, inserted right below.
                            row_copy = list(page_table[idx])
                            idx += 1
                            page_table_rows += 1

                            page_table.insert(idx, row_copy)

                        page_table[idx][3] = unicode(splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(splitted_vendors[i]).encode('utf-8')

                idx += 1

        table += page_table

        write(console, row_msg_format % (idx, page_table_rows), ending='')

    write(console, '')

    return table

示例#8

0

显示文件

# Get current date and time:
gentime = datetime.datetime.now()

# Query the server with HTTP basic authentication
pdfurl = "https://gho.berlin/wp-content/frei_stunden/VPS.pdf"
response = requests.get(pdfurl, auth=HTTPBasicAuth("<username>", "<password>"))
# Fail here on an HTTP error instead of saving an error page as a "PDF".
response.raise_for_status()
pdffilename = "vertretungsplan.pdf"

# Save the download. The previous `outputfile.close` (without parentheses)
# never closed the file, so the PDF could still be unflushed when it was
# parsed below; the with-block guarantees it is complete on disk.
with open(pdffilename, "wb") as outputfile:
    outputfile.write(response.content)

# extract table
pages = ["2"]
try:
    cells = [pdf.process_page(pdffilename, p) for p in pages]
finally:
    # The PDF is only a temporary artefact; remove it even if parsing fails.
    os.remove(pdffilename)

# flatten the cells structure
cells = [item for sublist in cells for item in sublist]

li = pdf.table_to_list(cells, pages)[2]

# Write Json into file
jsonfilename = "vertretungsplan.json"

if os.path.isfile(jsonfilename):
    # Remove old versions of the file
    os.remove(jsonfilename)

示例#9

0

显示文件

文件： pdf_table_extract.py 项目： ferhatelmas/docker-pdf-table-extract

def convert_pdf_to_text(src, dest):
    """Dump page 1 of the PDF ``src`` into the text file ``dest``.

    Element 5 of every cell returned by ``pdf.process_page`` (presumably the
    cell text -- confirm against pdftableextract) is written on its own line.
    """
    page_cells = pdf.process_page(src, "1")
    with open(dest, "w") as sink:
        for page_cell in page_cells:
            sink.writelines([page_cell[5], "\n"])

示例#10

0

显示文件

文件： getresult.py 项目： balasankarc/ResultGrabber

    def process(self, start, end, parentfolder):
        '''
        Parse the result PDFs numbered ``start``..``end`` (inclusive) and
        populate the result data structures.

        Each ``result<N>.pdf`` under ``<parentfolder>/Results`` is checked
        with ``PdfFileReader`` and its first page is parsed. Marks are
        collected in ``self.result_register`` (per student) and
        ``self.result_subject`` (per subject), which must already exist on
        the instance; ``self.registers`` and ``self.subjects`` are rebuilt.
        Both result mappings are written as JSON into ``parentfolder``.

        Returns the list of result numbers that could not be processed.
        '''
        self.badresult = []
        self.registers = {}
        self.subjects = {}
        result_pdf_path = os.path.join(parentfolder, 'Results')
        # Size of the inclusive range. The step used to be 100 / (end - start),
        # which raised ZeroDivisionError for a single result (recorded as a
        # bad result by the handler below) and overshot 100% otherwise.
        total = end - start + 1
        for count in range(start, end + 1):
            try:
                pages = ["1"]
                filename = "result" + str(count) + ".pdf"
                filepath = os.path.join(result_pdf_path, filename)
                with open(filepath, "rb") as f:
                    PdfFileReader(f)          # Checking if valid pdf file
                cells = [pdf.process_page(filepath, p)
                         for p in pages]
                cells = [item for sublist in cells for item in sublist]
                li = pdf.table_to_list(cells, pages)[1]
                for i in li:
                    if 'Branch' in i[0]:
                        # Header row: one string holding all labelled fields
                        # in the order College, Branch, Name, Register No,
                        # Exam Name. Each value is cut out between its label
                        # and the next one.
                        header = i[0]
                        collegepos = header.index('College : ')
                        branchpos = header.index('Branch : ')
                        namepos = header.index('Name : ')
                        registerpos = header.index('Register No : ')
                        exampos = header.index('Exam Name : ')
                        college = header[collegepos:branchpos][9:].strip().title()
                        branch = header[branchpos:namepos][9:].strip().title()
                        register = header[registerpos:exampos][13:].strip()
                        name = header[namepos:registerpos][7:].strip()
                        # Make sure every nesting level exists.
                        self.result_subject.setdefault(college, {}).setdefault(branch, {})
                        self.result_register.setdefault(college, {}).setdefault(branch, {})
                        self.registers.setdefault(college, {}).setdefault(branch, [])
                        self.subjects.setdefault(branch, [])
                    elif ('Mahatma' in i[0] or 'Sl. No' in i[0]
                            or 'Semester Result' in i[1]):
                        # Title, column-heading and summary rows carry no marks.
                        continue
                    else:
                        # Subject row: '-' means no mark and counts as 0.
                        subject = i[1]
                        internal = 0 if i[2] == '-' else int(i[2])
                        external = 0 if i[3] == '-' else int(i[3])
                        res = i[5]
                        if register not in self.registers[college][branch]:
                            self.registers[college][branch].append(register)
                        if subject not in self.subjects[branch]:
                            self.subjects[branch].append(subject)
                        student = self.result_register[college][
                            branch].setdefault(register, {})
                        student["name"] = name
                        student[subject] = \
                            [internal, external, internal + external, res]
                        by_subject = self.result_subject[college][
                            branch].setdefault(subject, {})
                        by_subject[register] = [external, res]
                current = self.parent.progressbar2.value()
                if current == -1:
                    current = 0
                self.parent.progressbar2.setValue(current + 100.0 / total)
            except Exception:
                # Missing, invalid or unexpectedly laid out PDF: remember the
                # number and carry on with the next result.
                self.badresult.append(count)
                continue
        self.parent.progressbar2.setValue(100)
        json1path = os.path.join(parentfolder, 'output_register.json')
        with open(json1path, 'w') as outfile:
            outfile.write(json.dumps(self.result_register, indent=4))
        json2path = os.path.join(parentfolder, 'output_subject.json')
        with open(json2path, 'w') as outfile2:
            outfile2.write(json.dumps(self.result_subject, indent=4))
        return self.badresult

示例#11

0

显示文件

文件： getresult.py 项目： balasankarc/asiet_results

def process(start, end):
    '''Parse the result PDFs numbered ``start``..``end`` (inclusive) and
    populate the module-level ``result`` mapping.

    Each ``result<N>.pdf`` in the current directory is checked with
    ``PdfFileReader`` and its first page is parsed; external marks are stored
    as ``result[college][branch][subject][register] = [external, res]``.
    Results that cannot be processed are skipped and listed at the end, and
    ``result`` is written to ``output.json``.'''
    global result, exam
    badresult = []
    # Width of the range for the percentage display. It is 0 for a single
    # result, which used to raise ZeroDivisionError inside the try block and
    # silently mark that result as unavailable.
    span = end - start
    for count in range(start, end + 1):
        try:
            if verbosity == 1:
                print("Roll Number # %s" % count)
            else:
                percent = float(count - start) * 100 / span if span else 100.0
                sys.stdout.write("\r%.2f%%" % percent)
                sys.stdout.flush()
            pages = ["1"]
            with open("result" + str(count) + ".pdf", "rb") as f:
                PdfFileReader(f)          # Checking if valid pdf file
            cells = [pdf.process_page("result" + str(count) + ".pdf", p)
                     for p in pages]
            cells = [item for sublist in cells for item in sublist]
            li = pdf.table_to_list(cells, pages)[1]
            for i in li:
                if 'Branch' in i[0]:
                    # Header row: one string holding all labelled fields;
                    # each value is cut out between its label and the next.
                    collegepos = i[0].index('College : ')
                    branchpos = i[0].index('Branch : ')
                    namepos = i[0].index('Name : ')
                    registerpos = i[0].index('Register No : ')
                    exampos = i[0].index('Exam Name : ')
                    college = i[0][collegepos:branchpos][9:].strip().title()
                    branch = i[0][branchpos:namepos][9:].strip().title()
                    exam = i[0][exampos:][11:].strip().title()
                    register = i[0][registerpos:exampos][13:].strip()
                    result.setdefault(college, {}).setdefault(branch, {})
                elif ('Mahatma' in i[0] or 'Sl. No' in i[0]
                        or 'Semester Result' in i[1]):
                    # Title, column-heading and summary rows carry no marks.
                    continue
                else:
                    # Subject row: '-' means no mark and counts as 0. The
                    # internal mark is still converted so a malformed value
                    # rejects the result, as before.
                    subject = i[1]
                    internal = 0 if i[2] == '-' else int(i[2])
                    external = 0 if i[3] == '-' else int(i[3])
                    res = i[5]
                    by_subject = result[college][branch].setdefault(subject, {})
                    by_subject[register] = [external, res]
        except Exception:
            # Missing, invalid or unexpectedly laid out PDF: skip it, but let
            # KeyboardInterrupt/SystemExit through.
            badresult.append(count)
            continue
    if badresult:
        print("\nUnavailable Results Skipped")
        for invalid in badresult:
            print("Roll Number # %s" % invalid)
    with open('output.json', 'w') as outfile:
        outfile.write(json.dumps(result))
    print("")

示例#12

0

显示文件

def transform(filename, console=None):
    """Extract every table row of the PDF ``filename`` into one flat list.

    Each page is run through ``pdftable``; blank rows and rows recognised by
    ``is_header_text`` are dropped, cell text is decoded from UTF-8 and
    stripped, and rows whose column 3 enumerates several products
    ("1. ... 2. ...") are expanded into one row per product. Progress
    messages are sent through ``write(console, ...)``.

    Returns the concatenated rows of all tables.
    """
    write(console, "  - Parsing...", ending='')

    # Only the page count is needed from the reader, so the handle is closed
    # as soon as the page list is built (it used to be left open).
    with open(filename, 'rb') as pdf_file:
        tablePdf = pdf.PdfFileReader(pdf_file)
        pages = [str(p) for p in range(1, len(tablePdf.pages) + 1)]

    # Process the cells and flatten the per-page cell structure
    cells = [pdftable.process_page(filename, p) for p in pages]
    cells = [item for sublist in cells for item in sublist]

    table = []

    write(console, "done")

    for page_table in pdftable.table_to_list(cells, pages):

        row_msg_format = "\r\033[K  - %d/%d records transformed"

        # The table is edited in place while it is walked, so the row count
        # and the cursor are tracked by hand.
        page_table_rows = len(page_table)
        idx = 0

        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')

            cell_len = sum([len(i) for i in page_table[idx]])

            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # Drop the row; idx now already points at the next one.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                for subidx in xrange(len(page_table[idx])):
                    page_table[idx][subidx] = unicode(
                        page_table[idx][subidx], encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    re_split_cases = r'\W?\d+\.\W\W'
                    splitted_products = re.split(re_split_cases,
                                                 page_table[idx][3])[1:]
                    splitted_vendors = re.split(re_split_cases,
                                                page_table[idx][4])[1:]
                    added_products = len(splitted_products)

                    # A vendor cell without numbering applies to every product.
                    if not len(splitted_vendors):
                        splitted_vendors = [page_table[idx][4]
                                            ] * added_products

                    for i in xrange(0, added_products):
                        if i > 0:
                            # Every product after the first gets its own
                            # copy of the row, inserted right below.
                            row_copy = list(page_table[idx])
                            idx += 1
                            page_table_rows += 1

                            page_table.insert(idx, row_copy)

                        page_table[idx][3] = unicode(
                            splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(
                            splitted_vendors[i]).encode('utf-8')

                idx += 1

        table += page_table

        write(console, row_msg_format % (idx, page_table_rows), ending='')

    write(console, '')

    return table

示例#13

0

显示文件

文件： test_to_pandas.py 项目： mgaitan/pdf-table-extract

import pandas as pd
import pdftableextract as pdf

pages = ["2"]

# Run process_page once per page and gather every cell into one flat list.
cells = [
    cell
    for p in pages
    for cell in pdf.process_page("a.pdf", p)
]

# With default options process_page also reports a blank table at the top of
# the page, so the table of interest is the one at index 1.
li = pdf.table_to_list(cells, pages)[1]

# li is a list of rows (for this table only):
#   li[0]    header line
#   li[1]    column headings
#   li[2:-1] data rows, whose first column holds the store names
#   li[-1]   footer line
data = pd.DataFrame(
    li[2:-1],
    columns=li[1],
    index=[row[0] for row in li[2:-1]],
)

示例#14

0

显示文件

文件： nursery_pdftocsv.py 项目： hkwi/our-data

def proc(input, output):
	"""Convert page 1 of the PDF ``input`` into the CSV file ``output``.

	Nothing is done when ``output`` already exists and is newer than
	``input``. Otherwise the table at index 1 of the page is written out.
	"""
	if os.path.exists(output) and os.stat(output).st_mtime > os.stat(input).st_mtime:
		return
	rs = pte.table_to_list(pte.process_page(input, "1"), 1)
	# Pick the table before opening the output: if it is missing, no empty
	# (but newer, hence "up to date") CSV is left behind.
	rows = rs[1]
	# The with-block flushes and closes the file; the handle used to be
	# left for the garbage collector.
	with open(output, "w") as f:
		csv.writer(f).writerows(rows)

示例#15

0

显示文件

import pdftableextract as pdf
import csv

pages = [str(i) for i in range(1, 7)]
cells = [
    pdf.process_page("./List_of_current_step_companies.pdf", p) for p in pages
]

# flatten cells
cells = [item for sublist in cells for item in sublist]

# The first field of a cell is its column index and the last is its text.
# Compare with == rather than `is`: identity of ints only works by accident
# for small CPython ints and fails for other integer types.
# XXX: I'm not able to list the companies on page 6
company_names = [col[-1] for col in cells if col[0] == 1]
founders = [col[-1] for col in cells if col[0] == 2]
email_ids = [col[-1] for col in cells if col[0] == 3]

companies = zip(company_names, founders, email_ids)
with open('list.md', 'w') as list_file:
    list_file.write("\n".join(company_names))

# NOTE(review): "wb" is the Python 2 convention for csv output; on Python 3
# this needs open("list.csv", "w", newline="") instead -- confirm the target
# interpreter before changing it.
with open("list.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(companies)

示例#16

0

显示文件

文件： test_to_pandas.py 项目： eugeneai/pdf-table-extract

from __future__ import print_function
import pandas as pd
import pdftableextract as pdf

pages = ["1"]

# Run process_page once per page, with the same extraction options each
# time, and gather every cell into one flat list.
cells = [
    cell
    for p in pages
    for cell in pdf.process_page(
        "example.pdf",
        p,
        outfilename="pandas-test",
        bitmap_resolution=100,
        checkall=False,
    )
]

# process_page also reports a blank table at the top of the page, so the
# table of interest is the one at index 1.
li = pdf.table_to_list(cells, pages)[1]

# li is a list of rows (for this table only):
#   li[0]    header line
#   li[1]    column headings
#   li[2:-1] data rows, whose first column holds the store names
#   li[-1]   footer line
data = pd.DataFrame(
    li[2:-1],
    columns=li[1],
    index=[row[0] for row in li[2:-1]],
)
print(data)