def tables(self):
    """Yield every table of this resource as a list of rows.

    For PDF resources the pages are converted to per-page CSV files next to
    ``self.local`` (``<name>_<i>.csv``) with ``pte``; the conversion is
    redone only when the MD5 digest of the document differs from the one
    stored in the metadata graph.  Each CSV file is then yielded as a list
    of rows.

    For every other resource the ``<table>`` elements of ``self.root`` are
    yielded as matrices of cell text, optionally restricted to the
    containers matched by the ``css`` selector found in ``self.env``.

    Raises:
        AssertionError: if ``pdfinfo`` does not report a page count.
    """
    with env(save=True):
        if self.is_pdf:
            # The dot is escaped so that only a literal ".pdf" is rewritten;
            # the unescaped pattern also matched e.g. "/mypdfs/" in the path.
            csv_glob = re.sub(r"\.pdf", "_*.csv", self.local)

            digest = hashlib.md5(self.open().read()).hexdigest()
            subject = rdflib.URIRef(self.url)
            with meta(env.get("meta")) as g:
                known = g.value(subject, NS1["md5"])
                if not known or digest != known.value:
                    # Document changed (or was never converted): drop the
                    # stale CSV files and regenerate all of them.
                    for stale in glob.glob(csv_glob):
                        os.remove(stale)

                    info = subprocess.check_output(
                        ("pdfinfo", self.local)).decode(sys.stdout.encoding)
                    pages = None
                    for line in info.splitlines():
                        m = re.match(r"^Pages:\s*(\d+)", line)
                        if m:
                            pages = int(m.group(1))
                    if not pages:
                        # Same exception type as the former ``assert``, but
                        # not stripped when running with -O.
                        raise AssertionError(
                            "pdfinfo reported no pages for %s" % self.local)

                    for i in range(pages):
                        p = i + 1  # process_page is given 1-based page numbers
                        try:
                            cells = pte.process_page(
                                self.local, str(p), whitespace="raw")
                        except Exception:
                            print(self.local, p, i)
                            raise
                        out = re.sub(r"\.pdf", "_%d.csv" % i, self.local)
                        try:
                            pte.output(cells, i, table_csv_filename=out)
                        except Exception:
                            logging.error(self.local, exc_info=True)
                            raise
                    # Remember the digest only after every page succeeded.
                    g.set((subject, NS1["md5"], rdflib.Literal(digest)))

            for path in glob.glob(csv_glob):
                # newline="" is what the csv module expects from file objects.
                with open(path, newline="") as fh:
                    rows = list(csv.reader(fh))
                yield rows
        else:
            def txt(obj):
                if obj is None:
                    return ""
                return obj.text_content()

            root = self.root
            if root is None:
                return
            css = self.env.get("css", None)
            # Without a selector the whole document is the only container.
            containers = [root] if css is None else root.cssselect(css)
            for main in containers:
                for tb in main.cssselect("table"):
                    yield Table(tb).matrix(txt)
#coding: utf-8
# Extract the table cells of one page of an annual-report PDF.
import pandas as pd
import pdftableextract as pdf

pages = ["95"]
#pages = ["84"]

# Other inputs that were tried:
#cells = [pdf.process_page("example.pdf", p) for p in pages]
#cells = [pdf.process_page("./2015_601628.pdf", p) for p in pages]
cells = [pdf.process_page("./2015_002594.pdf", p) for p in pages]
#cells = [pdf.process_page("./2015_601766.pdf", p) for p in pages]
#cells = [pdf.process_page("table.pdf", p) for p in pages]
#cells = [pdf.process_page("table_up.pdf", p) for p in pages]
#cells = [pdf.process_page("table_down.pdf", p) for p in pages]
#cells = [pdf.process_page("table_down_1.pdf", p) for p in pages]
#exit(0)

# cells is a list (one entry per page) of lists of 6-tuples:
#   [[(col, row, ?, ?, ?, "content of the cell"), ...]]
# NOTE(review): the three "?" fields are presumably colspan, rowspan and page
# number -- confirm against the pdftableextract documentation.

# Disabled debugging dump.  It used to sit inside a triple-quoted string that
# was never closed, which made the whole script a SyntaxError; it is kept
# here as comments instead.
#print "cells:"
#for cell in cells:        # cells: list of list
#    print "len(cells) == 1? ", len(cells)
#    for item in cell:     # cell: list of tuple
#        tempStr = ""
#        for element in item:    # item: tuple
#            tempStr += str(element) + ", "
#        print tempStr
# Convert the page ranges listed in shinseido_meta/index.csv into one CSV
# file per page for every PDF that is newer than its .ttl output.
import csv
import glob
import os
import os.path

import pdftableextract as pte

# Rows of the index file; loaded on first use so that a run with nothing to
# convert still does not require the index, and parsed only once overall.
index_rows = None

for f in glob.glob("www.city.kobe.lg.jp/child/grow/shinseido/img/*.pdf"):
    fn = os.path.basename(f)
    out = "shinseido/%s.ttl" % fn
    # Skip PDFs whose output is already up to date.
    if os.path.exists(out) and os.stat(out).st_mtime >= os.stat(f).st_mtime:
        continue

    if index_rows is None:
        with open("shinseido_meta/index.csv", encoding="UTF-8",
                  newline="") as index_file:
            index_rows = list(csv.DictReader(index_file))

    info = [r for r in index_rows if r["file"] == fn]
    assert info, fn
    pages = [int(info[0][k]) for k in ("start", "end")]
    # NOTE(review): range() excludes the "end" page itself -- confirm that the
    # index stores an exclusive end.
    for page in range(*pages):
        a = pte.process_page(f, str(page))
        x = pte.table_to_list(a, page)
        # newline="" keeps the csv module from emitting blank lines on
        # platforms that translate line endings.
        with open("shinseido/%s.p%02d.csv" % (fn, page), "w", newline="") as o:
            csv.writer(o).writerows(x[-1])
# Load table 1 of example.pdf (page 1) into a pandas DataFrame.
import pandas as pd
import pdftableextract as pdf

pages = ["1"]

# Collect the cells of every requested page into one flat list.
cells = []
for p in pages:
    cells.extend(pdf.process_page("example.pdf", p))

# Without any options, process_page picks up a blank table at the top of the
# page, so table '1' is the one of interest.
tables_by_index = pdf.table_to_list(cells, pages)
li = tables_by_index[1]

# li is a list of lists (for this table only!):
#   row 0       header line
#   row 1       column headings
#   rows 2..-2  data, with the store name in column 0
#   row -1      footer
column_headings = li[1]
data_rows = li[2:-1]
store_names = [row[0] for row in data_rows]
data = pd.DataFrame(data_rows, columns=column_headings, index=store_names)
# Load table 1 of example.pdf (page 1) into a pandas DataFrame and print it.
import pandas as pd
import pdftableextract as pdf

pages = ["1"]
cells = [pdf.process_page("example.pdf", p) for p in pages]

# flatten the cells structure
cells = [item for sublist in cells for item in sublist]

# Without any options, process_page picks up a blank table at the top of the
# page, so choose table '1'.
li = pdf.table_to_list(cells, pages)[1]

# li is a list of lists; the first line is the header, the last is the footer
# (for this table only!).
#   column '0' contains store names
#   row '1' contains column headings
#   data is row '2' through '-1'
data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])

# Single-argument call form: prints the same under Python 2 and is valid
# syntax under Python 3 (the bare print statement was not).
print(data)
# Column layout shared by the parsed result and the error placeholder frame.
_BUDGET_COLUMNS = [
    "Region", "District", "Sector/MDA/MMDA",
    "Central GOG and CF: Comp of Emp", "Central GOG and CF: Goods/Service",
    "Central GOG and CF: Assets (Capital)", "Central GOG and CF: Total",
    "IGF: Comp of Emp", "IGF: Goods/Service", "IGF: Assets (Capital)",
    "IGF: Total",
    "Funds/Others: Comp of Emp", "Funds/Others: Goods/Service",
    "Funds/Others: Assets (Capital)", "Funds/Others: Total",
    "Donor: Comp of Emp", "Donor: Goods/Service", "Donor: Assets (Capital)",
    "Donor: Total",
    "Grand Total Less NREG / Statutory",
]


def _error_frame(regionName, inputName):
    """Return the one-row "ERR" frame used when a PDF cannot be parsed."""
    row = [regionName, inputName] + ["ERR"] * (len(_BUDGET_COLUMNS) - 2)
    return pd.DataFrame([row], columns=_BUDGET_COLUMNS)


def _parse_budget_cell(cell_text, regionName, inputName):
    """Split one table cell into rows of ``[region, district, label, numbers...]``.

    Runs of two or more non-digit characters are treated as labels.  With
    several labels, the text between consecutive labels is split on spaces;
    with one or no label, the numbers are collected with a regex.  Rows that
    carry nothing besides the label are dropped.
    """
    labels = re.findall("[^0-9]{2,}", cell_text)
    candidates = []
    if len(labels) > 1:
        for i, label in enumerate(labels):
            start = cell_text.index(label) + len(label)
            if i + 1 < len(labels):
                end = cell_text.index(labels[i + 1])
            else:
                end = len(cell_text)
            candidates.append([label.strip()] + cell_text[start:end].split(" "))
    elif len(labels) == 1:
        numbers = re.findall("[0-9.,]+", cell_text[len(labels[0]):])
        candidates.append([labels[0].strip()] + numbers)
    else:
        candidates.append([""] + re.findall("[0-9.,]+", cell_text))
    return [[regionName, inputName] + c for c in candidates if len(c) > 1]


def search(args):
    """Extract the budget tables from the first two pages mentioning a term.

    Args:
        args: an ``(inputFile, search)`` pair -- the path of the PDF and the
            text to look for.  A single tuple argument is kept so existing
            callers keep working; the tuple-parameter syntax used before is
            not valid on Python 3.

    Returns:
        A ``pandas.DataFrame`` with the 20 budget columns.  When fewer than
        two tables are found, or anything goes wrong while reading or parsing
        the PDF, a single placeholder row filled with "ERR" is returned.
    """
    inputFile, term = args
    baseName = os.path.basename(inputFile)
    inputName, _ = os.path.splitext(baseName)
    print(inputName)
    # The region is the name of the directory that contains the file.
    regionName = inputFile.split("/")[-2]

    fr = open(inputFile, 'rb')
    try:
        try:
            reader = pyPdf.PdfFileReader(fr)
            hits = []
            for page in range(reader.getNumPages()):
                spin()
                text = reader.getPage(page).extractText()
                # Only the first two matching pages are kept.
                if text.find(term) > -1 and len(hits) < 2:
                    hits.append(str(page + 1))
        finally:
            # Closed on the error path too (it used to leak there).
            fr.close()

        cells = [pdfextract.process_page(inputFile, p) for p in hits]
        cells = [item for sublist in cells for item in sublist]
        li = pdfextract.table_to_list(cells, hits)
        if len(li) <= 1:
            return _error_frame(regionName, inputName)

        data = []
        # First and last row of each table are skipped.
        for row in li[-2][1:-1]:
            spin()
            data.extend(_parse_budget_cell(row[0], regionName, inputName))
        for row in li[-1][1:-1]:
            spin()
            # Rows mentioning "Page" in the second table are not data.
            if row[0].find("Page") == -1:
                data.extend(_parse_budget_cell(row[0], regionName, inputName))
        return pd.DataFrame(data, columns=_BUDGET_COLUMNS)
    except Exception:
        # Best effort: any failure yields the placeholder row rather than
        # aborting the whole batch.
        return _error_frame(regionName, inputName)
def transform(filename, console=None):
    """Extract all table rows of a PDF and normalise them.

    Every page of ``filename`` is run through ``pdftable``.  Empty rows and
    rows recognised by ``is_header_text`` are dropped, every cell is decoded
    from UTF-8 and stripped, and rows whose column 3 holds an enumerated list
    ("1. medicament one 2. medicament two") are expanded into one row per
    entry, with column 4 split the same way.

    Args:
        filename: path of the PDF file.
        console: passed through to ``write`` for progress messages.

    Returns:
        The list of transformed rows of all pages.
    """
    write(console, " - Parsing...", ending='')
    # Only the page count is needed, so the handle is closed right away
    # (it used to be opened with file() and never closed).
    with open(filename, 'rb') as pdf_file:
        page_count = len(pdf.PdfFileReader(pdf_file).pages)
    pages = [str(p) for p in range(1, page_count + 1)]

    # Process the cells of every page and flatten the per-page structure
    cells = [pdftable.process_page(filename, p) for p in pages]
    cells = [item for sublist in cells for item in sublist]

    table = []
    write(console, "done")
    row_msg_format = "\r\033[K - %d/%d records transformed"
    re_split_cases = r'\W?\d+\.\W\W'

    for page_table in pdftable.table_to_list(cells, pages):
        page_table_rows = len(page_table)
        idx = 0
        # Index-based loop: rows are deleted and inserted while walking.
        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')
            cell_len = sum(len(cell) for cell in page_table[idx])
            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # Do not advance: the next row slides into this index.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                for subidx in xrange(len(page_table[idx])):
                    page_table[idx][subidx] = unicode(
                        page_table[idx][subidx], encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    splitted_products = re.split(re_split_cases,
                                                 page_table[idx][3])[1:]
                    splitted_vendors = re.split(re_split_cases,
                                                page_table[idx][4])[1:]
                    added_products = len(splitted_products)
                    if not len(splitted_vendors):
                        # A single, un-enumerated vendor applies to every
                        # product of the row.
                        splitted_vendors = [page_table[idx][4]] * added_products

                    for i in xrange(0, added_products):
                        if i > 0:
                            # Each additional product gets its own copy of
                            # the row, inserted right after the current one.
                            row_copy = list(page_table[idx])
                            idx += 1
                            page_table_rows += 1
                            page_table.insert(idx, row_copy)
                        page_table[idx][3] = unicode(
                            splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(
                            splitted_vendors[i]).encode('utf-8')
                idx += 1

        table += page_table
        write(console, row_msg_format % (idx, page_table_rows), ending='')

    write(console, '')
    return table
# Get current date and time:
gentime = datetime.datetime.now()

# Query the server with authentication
pdfurl = "https://gho.berlin/wp-content/frei_stunden/VPS.pdf"
response = requests.get(pdfurl, auth=HTTPBasicAuth("<username>", "<password>"))
pdffilename = "vertretungsplan.pdf"

# Store the download in a temporary file.  The context manager guarantees the
# data is flushed and the file closed before it is parsed; the former
# `outputfile.close` lacked the call parentheses and never closed anything.
with open(pdffilename, "wb") as outputfile:
    outputfile.write(response.content)

# extract table
pages = ["2"]
try:
    cells = [pdf.process_page(pdffilename, p) for p in pages]
finally:
    # Remove the temporary PDF even when the extraction fails.
    os.remove(pdffilename)

# flatten the cells structure
cells = [item for sublist in cells for item in sublist]
li = pdf.table_to_list(cells, pages)[2]

# Write Json into file
jsonfilename = "vertretungsplan.json"
if os.path.isfile(jsonfilename):
    # Remove old versions of the file
    os.remove(jsonfilename)
def convert_pdf_to_text(src, dest):
    """Write the sixth field of every cell on page 1 of ``src`` to ``dest``.

    Each cell returned by ``pdf.process_page`` contributes one line to the
    output file, which is created or overwritten.
    """
    page_cells = pdf.process_page(src, "1")
    with open(dest, "w") as out:
        for entry in page_cells:
            content = entry[5]
            out.write(content)
            out.write("\n")
def process(self, start, end, parentfolder):
    '''
    Process the result PDFs numbered ``start`` to ``end`` (inclusive) and
    populate the result data structures.

    For every ``<parentfolder>/Results/result<N>.pdf`` the first page is
    parsed.  The row containing 'Branch' supplies college, branch, name and
    register number; every remaining row (apart from the known heading rows)
    is a subject with internal and external marks and a result.  The
    collected data is stored in ``self.result_register`` and
    ``self.result_subject`` and written to ``output_register.json`` and
    ``output_subject.json`` in ``parentfolder``.

    Returns the list of numbers whose PDF could not be processed (also
    stored in ``self.badresult``).
    '''
    self.badresult = []
    self.registers = {}
    self.subjects = {}
    result_pdf_path = os.path.join(parentfolder, 'Results')
    pages = ["1"]

    # Progress contributed by one file.  The range is inclusive, so it has
    # end - start + 1 entries; dividing by (end - start) raised
    # ZeroDivisionError for a single result, which the handler below then
    # recorded as a bad result.
    total = end - start + 1
    unit = 100.0 / total if total > 0 else 0.0

    for count in range(start, end + 1):
        # Reset per file so that a PDF without a header row is reported as
        # bad (KeyError below) instead of reusing the previous student's data.
        college = branch = register = name = None
        try:
            filename = "result" + str(count) + ".pdf"
            filepath = os.path.join(result_pdf_path, filename)
            with open(filepath, "rb") as f:
                PdfFileReader(f)  # Checking if valid pdf file
            cells = [pdf.process_page(filepath, p) for p in pages]
            cells = [item for sublist in cells for item in sublist]
            li = pdf.table_to_list(cells, pages)[1]
            for i in li:
                if 'Branch' in i[0]:
                    # Header row: all fields share one cell, in this order.
                    collegepos = i[0].index('College : ')
                    branchpos = i[0].index('Branch : ')
                    namepos = i[0].index('Name : ')
                    registerpos = i[0].index('Register No : ')
                    exampos = i[0].index('Exam Name : ')
                    college = i[0][collegepos:branchpos][9:].strip().title()
                    branch = i[0][branchpos:namepos][9:].strip().title()
                    register = i[0][registerpos:exampos][13:].strip()
                    name = i[0][namepos:registerpos][7:].strip()
                    self.result_subject.setdefault(
                        college, {}).setdefault(branch, {})
                    self.result_register.setdefault(
                        college, {}).setdefault(branch, {})
                    self.registers.setdefault(
                        college, {}).setdefault(branch, [])
                    self.subjects.setdefault(branch, [])
                elif 'Mahatma' in i[0]:
                    pass
                elif 'Sl. No' in i[0]:
                    pass
                elif 'Semester Result' in i[1]:
                    pass
                else:
                    subject = i[1]
                    # '-' means no mark was entered.
                    internal = 0 if i[2] == '-' else int(i[2])
                    external = 0 if i[3] == '-' else int(i[3])
                    res = i[5]
                    if register not in self.registers[college][branch]:
                        self.registers[college][branch].append(register)
                    if subject not in self.subjects[branch]:
                        self.subjects[branch].append(subject)
                    student = self.result_register[college][branch].setdefault(
                        register, {"name": name})
                    student[subject] = [
                        internal, external, internal + external, res]
                    by_subject = self.result_subject[college][branch]
                    by_subject.setdefault(subject, {})[register] = [
                        external, res]
            current = self.parent.progressbar2.value()
            if current == -1:
                current = 0
            self.parent.progressbar2.setValue(current + unit)
        except Exception:
            # Missing, invalid or unparseable PDF: remember it and go on.
            self.badresult.append(count)
            continue
    self.parent.progressbar2.setValue(100)

    json1path = os.path.join(parentfolder, 'output_register.json')
    with open(json1path, 'w') as outfile:
        outfile.write(json.dumps(self.result_register, indent=4))
    json2path = os.path.join(parentfolder, 'output_subject.json')
    with open(json2path, 'w') as outfile2:
        outfile2.write(json.dumps(self.result_subject, indent=4))
    return self.badresult
def process(start, end):
    '''Process the result PDFs numbered ``start`` to ``end`` (inclusive).

    Page 1 of every ``result<N>.pdf`` in the current directory is parsed and
    the external mark and result of each subject are stored in the global
    ``result`` as ``result[college][branch][subject][register]``.  Numbers
    whose PDF cannot be processed are skipped and listed at the end.  The
    collected data is written to ``output.json``.
    '''
    global result, exam
    badresult = []
    pages = ["1"]
    span = end - start
    for count in range(start, end + 1):
        # Reset per file so that a PDF without a header row is skipped
        # (KeyError below) instead of reusing the previous student's data.
        college = branch = register = None
        try:
            if verbosity == 1:
                print("Roll Number # %s" % count)
            else:
                # A single-result range has span 0; the unguarded division
                # raised ZeroDivisionError and the result was skipped as bad.
                percent = float(count - start) * 100 / span if span else 100.0
                sys.stdout.write("\r%.2f%%" % percent)
                sys.stdout.flush()
            pdfname = "result" + str(count) + ".pdf"
            with open(pdfname, "rb") as f:
                PdfFileReader(f)  # Checking if valid pdf file
            cells = [pdf.process_page(pdfname, p) for p in pages]
            cells = [item for sublist in cells for item in sublist]
            li = pdf.table_to_list(cells, pages)[1]
            for i in li:
                if 'Branch' in i[0]:
                    # Header row: all fields share one cell, in this order.
                    collegepos = i[0].index('College : ')
                    branchpos = i[0].index('Branch : ')
                    namepos = i[0].index('Name : ')
                    registerpos = i[0].index('Register No : ')
                    exampos = i[0].index('Exam Name : ')
                    college = i[0][collegepos:branchpos][9:].strip().title()
                    branch = i[0][branchpos:namepos][9:].strip().title()
                    exam = i[0][exampos:][11:].strip().title()
                    register = i[0][registerpos:exampos][13:].strip()
                    result.setdefault(college, {}).setdefault(branch, {})
                elif 'Mahatma' in i[0]:
                    pass
                elif 'Sl. No' in i[0]:
                    pass
                elif 'Semester Result' in i[1]:
                    pass
                else:
                    subject = i[1]
                    # Both marks are validated ('-' means no mark entered)
                    # although only the external one is stored.
                    internal = 0 if i[2] == '-' else int(i[2])
                    external = 0 if i[3] == '-' else int(i[3])
                    res = i[5]
                    by_subject = result[college][branch]
                    by_subject.setdefault(subject, {})[register] = [
                        external, res]
        except Exception:
            # Missing, invalid or unparseable PDF: remember it and go on.
            badresult.append(count)
            continue
    if badresult:
        print("\nUnavailable Results Skipped")
        for invalid in badresult:
            print("Roll Number # %s" % invalid)
    with open('output.json', 'w') as outfile:
        outfile.write(json.dumps(result))
    print("")
def transform(filename, console=None):
    """Return the normalised table rows found in the PDF ``filename``.

    All pages are extracted with ``pdftable``.  Rows that are empty or that
    ``is_header_text`` recognises are removed; the cells of the remaining
    rows are decoded from UTF-8 and stripped.  A row whose column 3 contains
    an enumeration ("1. medicament one 2. medicament two") is replaced by one
    row per enumerated product, with column 4 (vendors) split alongside.

    Args:
        filename: path of the PDF file.
        console: handed to ``write`` for progress output.

    Returns:
        A list with the rows of every page, in page order.
    """
    write(console, " - Parsing...", ending='')
    # The reader is only needed for the page count; open the file in a
    # context manager so the handle does not leak (file() was never closed).
    with open(filename, 'rb') as pdf_file:
        page_count = len(pdf.PdfFileReader(pdf_file).pages)
    pages = [str(p) for p in range(1, page_count + 1)]

    # Process the cells of every page and flatten the per-page structure
    cells = [pdftable.process_page(filename, p) for p in pages]
    cells = [item for sublist in cells for item in sublist]

    table = []
    write(console, "done")
    row_msg_format = "\r\033[K - %d/%d records transformed"
    re_split_cases = r'\W?\d+\.\W\W'

    for page_table in pdftable.table_to_list(cells, pages):
        page_table_rows = len(page_table)
        idx = 0
        # The list is edited in place, hence the manual index bookkeeping.
        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')
            cell_len = sum(len(cell) for cell in page_table[idx])
            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # idx stays put: the following row now occupies this slot.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                for subidx in xrange(len(page_table[idx])):
                    page_table[idx][subidx] = unicode(
                        page_table[idx][subidx], encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    splitted_products = re.split(re_split_cases,
                                                 page_table[idx][3])[1:]
                    splitted_vendors = re.split(re_split_cases,
                                                page_table[idx][4])[1:]
                    added_products = len(splitted_products)
                    if not len(splitted_vendors):
                        # One un-enumerated vendor is shared by all products.
                        splitted_vendors = [page_table[idx][4]] * added_products

                    for i in xrange(0, added_products):
                        if i > 0:
                            # Further products go into a copy of the row
                            # placed directly below the current one.
                            row_copy = list(page_table[idx])
                            idx += 1
                            page_table_rows += 1
                            page_table.insert(idx, row_copy)
                        page_table[idx][3] = unicode(
                            splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(
                            splitted_vendors[i]).encode('utf-8')
                idx += 1

        table += page_table
        write(console, row_msg_format % (idx, page_table_rows), ending='')

    write(console, '')
    return table
# Load table 1 extracted from page 2 of a.pdf into a pandas DataFrame.
import pandas as pd
import pdftableextract as pdf

pages = ["2"]

# One flat list with the cells of all requested pages.
cells = []
for p in pages:
    for item in pdf.process_page("a.pdf", p):
        cells.append(item)

# Without any options, process_page picks up a blank table at the top of the
# page, hence table '1' is selected.
li = pdf.table_to_list(cells, pages)[1]

# Layout of li (for this table only!): first line header, row '1' column
# headings, rows '2' through '-1' data with the store name in column '0',
# last line footer.
records = li[2:-1]
data = pd.DataFrame(
    records,
    columns=li[1],
    index=[record[0] for record in records],
)
def proc(input, output):
    """Convert table 1 of page 1 of the PDF ``input`` into the CSV ``output``.

    Nothing is done when ``output`` already exists and is newer than
    ``input``.
    """
    if os.path.exists(output) and os.stat(output).st_mtime > os.stat(input).st_mtime:
        return
    rs = pte.table_to_list(pte.process_page(input, "1"), 1)
    # Close the file deterministically so the rows are flushed to disk; the
    # handle used to be passed to csv.writer and never closed.
    with open(output, "w") as fh:
        csv.writer(fh).writerows(rs[1])
# List the companies, founders and e-mail addresses found in the first six
# pages of List_of_current_step_companies.pdf.
import csv

import pdftableextract as pdf

pages = [str(i) for i in range(1, 7)]
cells = [
    pdf.process_page("./List_of_current_step_companies.pdf", p) for p in pages
]

# flatten cells
cells = [item for sublist in cells for item in sublist]

# XXX: I'm not able to list the companies on page 6
# Compare by value: `is` tests object identity and only appeared to work for
# small integers because of an interpreter implementation detail.
company_names = [col[-1] for col in cells if col[0] == 1]
founders = [col[-1] for col in cells if col[0] == 2]
email_ids = [col[-1] for col in cells if col[0] == 3]

companies = zip(company_names, founders, email_ids)

with open('list.md', 'w') as list_file:
    list_file.write("\n".join(company_names))

# NOTE(review): binary mode is what the Python 2 csv module expects; on
# Python 3 this has to become open("list.csv", "w", newline="").
with open("list.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(companies)
from __future__ import print_function import pandas as pd import pdftableextract as pdf pages = ["1"] cells = [pdf.process_page("example.pdf", p, outfilename="pandas-test", bitmap_resolution=100, checkall=False) for p in pages] #flatten the cells structure cells = [item for sublist in cells for item in sublist] #without any options, process_page picks up a blank table at the top of the page. #so choose table '1' li = pdf.table_to_list(cells, pages)[1] #li is a list of lists, the first line is the header, last is the footer (for this table only!) #column '0' contains store names #row '1' contains column headings #data is row '2' through '-1' data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) print(data)