Exemplo n.º 1
0
import csv
import pdftableextract as pte
import csv
import os
import os.path
import glob

# Convert the tables of every downloaded PDF into one CSV file per page.
for f in glob.glob("www.city.kobe.lg.jp/child/grow/shinseido/img/*.pdf"):
    fn = os.path.basename(f)
    out = "shinseido/%s.ttl" % fn
    # Skip PDFs whose .ttl file is at least as new as the PDF itself.
    # NOTE(review): nothing in this loop writes the .ttl file - presumably a
    # later step produces it; confirm.
    if os.path.exists(out) and os.stat(out).st_mtime >= os.stat(f).st_mtime:
        continue

    # Look up the page range for this PDF.  The index is opened in a ``with``
    # block so the handle is closed instead of being left to the garbage
    # collector on every iteration.
    with open("shinseido_meta/index.csv", encoding="UTF-8") as index:
        info = [r for r in csv.DictReader(index) if r["file"] == fn]
    assert info, fn

    # int() already raises on a missing or malformed value, so no further
    # check is needed (the resulting two-element list is always truthy).
    pages = [int(info[0][k]) for k in ("start", "end")]

    # NOTE(review): range() excludes the "end" page - confirm that index.csv
    # stores an exclusive end.
    for page in range(*pages):
        a = pte.process_page(f, str(page))
        x = pte.table_to_list(a, page)
        # newline="" is what the csv module requires of files it writes to;
        # without it, platforms that translate "\n" emit "\r\r\n" row endings.
        with open("shinseido/%s.p%02d.csv" % (fn, page), "w", newline="") as o:
            w = csv.writer(o)
            # Only the last table returned for the page is exported.
            w.writerows(x[-1])
Exemplo n.º 2
0
"""
print "cells:"
for cell in cells:	#cells: list of tuple
	tempStr = ""
	for item in cell: #cell: tuple
		tempStr += str(item) + ", "
	print tempStr
"""

# Open question: should several tables on the same page be handled here?
# Apparently not - no obvious difference was seen between two different tables.
# (Detecting a table boundary by a row that spans every column is not reliable
# either, because some tables contain such a full-width row in the middle of
# their data.)

#------------------------------------------------------------------------------------------------------------------------------
# Without any options, process_page picks up a blank table at the top of the page,
# so take the last table returned by table_to_list (index -1).
li = pdf.table_to_list(cells, pages)[-1]
print "li: " 	# li is a list of lists: one inner list of cell strings per row
for line in li:
	print ", ".join(line)
"""
li:
[
["content of cell 0 in row 0", "content of cell 1 in row 0",... , "content of cell n in row 0"], 
["content of cell 0 in row 1", "content of cell 1 in row 1",... , "content of cell n in row 1"],
...
["content of cell 0 in row m", "content of cell 1 in row m",... , "content of cell n in row m"]
]
"""

#li is a list of lists, the first line is the header, last is the footer (for this table only!)
#column '0' contains store names
# Column layout shared by the parsed result and the error placeholder frame.
_BUDGET_COLUMNS = [
    "Region", "District", "Sector/MDA/MMDA",
    "Central GOG and CF: Comp of Emp", "Central GOG and CF: Goods/Service",
    "Central GOG and CF: Assets (Capital)", "Central GOG and CF: Total",
    "IGF: Comp of Emp", "IGF: Goods/Service", "IGF: Assets (Capital)",
    "IGF: Total",
    "Funds/Others: Comp of Emp", "Funds/Others: Goods/Service",
    "Funds/Others: Assets (Capital)", "Funds/Others: Total",
    "Donor: Comp of Emp", "Donor: Goods/Service", "Donor: Assets (Capital)",
    "Donor: Total",
    "Grand Total Less NREG / Statutory",
]


def _error_frame(regionName, inputName):
    """Return the one-row placeholder frame used when a PDF cannot be parsed."""
    row = [regionName, inputName] + ["ERR"] * (len(_BUDGET_COLUMNS) - 2)
    return pd.DataFrame([row], columns=_BUDGET_COLUMNS)


def _parse_budget_row(cell, prefix):
    """Split one table cell into zero or more output rows.

    ``cell`` is the text of the first column of a table row; ``prefix`` is the
    ``[region, district]`` pair put in front of every produced row.
    """
    # Runs of two or more non-digit characters are treated as labels.
    labels = re.findall(r"[^0-9]{2,}", cell)

    if len(labels) > 1:
        # Several labels in one cell: the text between one label and the next
        # (or the end of the cell) holds that label's space-separated numbers.
        rows = []
        for i, label in enumerate(labels):
            start = cell.index(label) + len(label)
            if i + 1 < len(labels):
                end = cell.index(labels[i + 1])
            else:
                end = len(cell)
            rows.append(prefix + [label.strip()] + cell[start:end].split(" "))
        return rows

    if labels:
        label = labels[0].strip()
        # NOTE(review): slicing by len(labels[0]) assumes the label starts at
        # the beginning of the cell - confirm against real input.
        numbers = re.findall(r"[0-9.,]+", cell[len(labels[0]):])
    else:
        label = ""
        numbers = re.findall(r"[0-9.,]+", cell)

    # A label without any number is not a data row and is dropped.
    return [prefix + [label] + numbers] if numbers else []


def search(args):
    """Extract the budget table that follows a search string in a PDF.

    Args:
        args: an ``(inputFile, search)`` pair - the PDF path and the text to
            look for.  It stays a single tuple argument so existing callers
            that pass one tuple keep working (tuple parameters in the ``def``
            line are Python-2-only syntax).

    Returns:
        A ``pandas.DataFrame`` with the budget columns.  The region is the
        parent directory name of ``inputFile`` and the district is its file
        name without extension.  When fewer than two tables are found, or any
        error occurs while reading or parsing, a single row filled with
        ``"ERR"`` is returned instead.

    Raises:
        IOError/OSError: if ``inputFile`` cannot be opened.
        IndexError: if ``inputFile`` contains no ``/`` separator.
    """
    inputFile, needle = args
    baseName = os.path.basename(inputFile)
    inputName = os.path.splitext(baseName)[0]
    print(inputName)
    regionName = inputFile.split("/")[-2]

    fr = open(inputFile, 'rb')
    try:
        try:
            reader = pyPdf.PdfFileReader(fr)
            # 1-based numbers (as strings) of the first two pages whose text
            # contains the search string.
            hits = []
            for page in range(reader.getNumPages()):
                spin()
                text = reader.getPage(page).extractText()
                if text.find(needle) > -1 and len(hits) < 2:
                    hits.append(str(page + 1))
        finally:
            # Close the handle even when reading a page fails.
            fr.close()

        cells = [pdfextract.process_page(inputFile, p) for p in hits]
        cells = [item for sublist in cells for item in sublist]
        li = pdfextract.table_to_list(cells, hits)
        if len(li) <= 1:
            return _error_frame(regionName, inputName)

        prefix = [regionName, inputName]
        data = []
        # The last two tables carry the data; their first and last rows are
        # skipped.
        for row in li[-2][1:-1]:
            spin()
            data.extend(_parse_budget_row(row[0], prefix))
        for row in li[-1][1:-1]:
            spin()
            # Rows mentioning "Page" in the final table are skipped.
            if row[0].find("Page") == -1:
                data.extend(_parse_budget_row(row[0], prefix))

        # Raises (and falls through to the error frame) when a parsed row
        # does not have exactly one value per column.
        return pd.DataFrame(data, columns=_BUDGET_COLUMNS)
    except Exception:
        # Best effort: one unreadable PDF must not abort a batch, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return _error_frame(regionName, inputName)
import pandas as pd
import pdftableextract as pdf

# Page numbers are handed to pdftableextract as strings; only page 1 is read.
pages = ["1"]

# Gather the cells of every requested page into a single flat list.
cells = [
    cell
    for page_no in pages
    for cell in pdf.process_page("example.pdf", page_no)
]

# Run without options, process_page also reports a blank table at the top of
# the page as table 0, so the table of interest is table 1.
li = pdf.table_to_list(cells, pages)[1]

# Layout of li (true for this table only): row 1 holds the column headings,
# the last row is a footer, and everything from row 2 up to the footer is
# data.  Column 0 of each data row is the store name, used here as the index.
body_rows = li[2:-1]
data = pd.DataFrame(
    body_rows,
    columns=li[1],
    index=[row[0] for row in body_rows],
)
Exemplo n.º 5
0
def transform(filename, console=None):
    """Extract all table rows of a PDF into one flat, cleaned-up list.

    Every page of ``filename`` is run through ``pdftable``.  Rows whose cells
    are all empty, or whose first cell is recognised by ``is_header_text``,
    are dropped.  A row whose column 3 holds a numbered list
    ("1. first 2. second") is expanded into one row per list entry, with
    column 4 split the same way.

    Args:
        filename: path of the PDF to read.
        console: passed through to ``write`` for progress output.

    Returns:
        A list of rows (lists of cell values) from all pages, in page order.
    """
    write(console, "  - Parsing...", ending='')

    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle once the page count is known (it used to stay open).
    with open(filename, 'rb') as pdf_file:
        tablePdf   = pdf.PdfFileReader(pdf_file)
        pages      = [ str(p) for p in range(1, len(tablePdf.pages) + 1)]

    # Process the cells of each page and flatten them into one list
    cells       = [pdftable.process_page(filename, p) for p in pages]
    cells       = [item for sublist in cells for item in sublist]

    table       = []

    write(console, "done")

    # "\r\033[K" returns to the line start and clears it so the counter is
    # redrawn in place.
    row_msg_format = "\r\033[K  - %d/%d records transformed"

    for page_table in pdftable.table_to_list(cells, pages):

        # page_table is edited in place (rows deleted and inserted), so the
        # row count is tracked by hand next to the cursor idx.
        page_table_rows = len(page_table)
        idx             = 0

        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')

            cell_len       = sum(len(i) for i in page_table[idx])

            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # Drop the row; idx now already points at the next one.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                for subidx in xrange(len(page_table[idx])):
                    page_table[idx][subidx] = unicode(page_table[idx][subidx], encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    re_split_cases    = r'\W?\d+\.\W\W'
                    # [1:] discards whatever precedes the first "N. " marker.
                    splitted_products = re.split(re_split_cases, page_table[idx][3])[1:]
                    splitted_vendors  = re.split(re_split_cases, page_table[idx][4])[1:]
                    added_products    = len(splitted_products)

                    # No numbered vendors: every product gets the same vendor.
                    # NOTE(review): a non-empty vendor list shorter than the
                    # product list raises IndexError below.
                    if not splitted_vendors:
                        splitted_vendors = [ page_table[idx][4] ] * added_products

                    for i in xrange(0, added_products):
                        if i > 0:
                            # Each further product gets its own copy of the
                            # row, inserted right after the current one.
                            row_copy         = list(page_table[idx])
                            idx             += 1
                            page_table_rows += 1

                            page_table.insert(idx, row_copy)

                        # NOTE(review): these two cells are re-encoded to
                        # UTF-8 byte strings while the other cells stay
                        # unicode - confirm callers expect the mix.
                        page_table[idx][3] = unicode(splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(splitted_vendors[i]).encode('utf-8')

                idx             += 1

        table += page_table

        write(console, row_msg_format % (idx, page_table_rows), ending='')

    write(console, '')

    return table
Exemplo n.º 6
0
def process(start, end):
    '''Parse result<N>.pdf for every roll number N in start..end (inclusive).

    Marks are collected in the global ``result`` as
    result[college][branch][subject][register] = [external, res], the global
    ``exam`` is set to the exam name of the last header parsed, and
    ``result`` is finally written to output.json.  Roll numbers whose PDF is
    missing or cannot be parsed are skipped and listed on stdout.'''
    global result, exam
    badresult = []
    span = end - start
    for count in range(start, end + 1):
        # Reset per file so that a PDF without a header row is reported as
        # bad instead of being filed under the previous student's values.
        college = branch = register = None
        try:
            if verbosity == 1:
                print("Roll Number # %s" % count)
            else:
                # With a single roll number span is 0; dividing by it used to
                # raise inside this try block and flag the result as bad.
                percent = float(count - start) * 100 / span if span else 100.0
                sys.stdout.write("\r%.2f%%" % percent)
                sys.stdout.flush()
            pages = ["1"]
            filename = "result" + str(count) + ".pdf"
            with open(filename, "rb") as f:
                PdfFileReader(f)          # Checking if valid pdf file
            cells = [pdf.process_page(filename, p) for p in pages]
            cells = [item for sublist in cells for item in sublist]
            li = pdf.table_to_list(cells, pages)[1]
            for i in li:
                if 'Branch' in i[0]:
                    # Header row: a single cell holding the labelled fields
                    # "College : ", "Branch : ", "Name : ", "Register No : "
                    # and "Exam Name : ", in that order.
                    header = i[0]
                    collegepos = header.index('College : ')
                    branchpos = header.index('Branch : ')
                    namepos = header.index('Name : ')
                    registerpos = header.index('Register No : ')
                    exampos = header.index('Exam Name : ')
                    # The numeric offsets skip each label; strip() removes
                    # what is left of the separator.
                    college = header[collegepos:branchpos][9:].strip().title()
                    branch = header[branchpos:namepos][9:].strip().title()
                    exam = header[exampos:][11:].strip().title()
                    register = header[registerpos:exampos][13:].strip()
                    result.setdefault(college, {}).setdefault(branch, {})
                elif ('Mahatma' in i[0] or 'Sl. No' in i[0]
                        or 'Semester Result' in i[1]):
                    # Title, column-heading and summary rows carry no marks.
                    continue
                else:
                    subject = i[1]
                    # '-' stands for "no mark".  The internal mark is only
                    # validated here; it is not part of the stored record.
                    internal = 0 if i[2] == '-' else int(i[2])
                    external = 0 if i[3] == '-' else int(i[3])
                    res = i[5]
                    subjects = result[college][branch]
                    subjects.setdefault(subject, {})[register] = \
                        [external, res]
        except Exception:
            # Missing or malformed PDF: remember the roll number and go on.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            badresult.append(count)
    if badresult:
        print("\nUnavailable Results Skipped")
        for invalid in badresult:
            print("Roll Number # %s" % invalid)
    with open('output.json', 'w') as outfile:
        outfile.write(json.dumps(result))
    print("")
Exemplo n.º 7
0
    def process(self, start, end, parentfolder):
        '''
        Parse Results/result<N>.pdf under parentfolder for every N in
        start..end (inclusive) and populate the result structures.

        Fills self.result_register[college][branch][register] with the
        student's "name" plus one [internal, external, total, res] list per
        subject, and self.result_subject[college][branch][subject][register]
        with [external, res].  Both are written as JSON into parentfolder
        (output_register.json and output_subject.json).

        Returns the list of numbers whose PDF was missing or unparsable.
        '''
        self.badresult = []
        self.registers = {}
        self.subjects = {}
        result_pdf_path = os.path.join(parentfolder, 'Results')
        # range(start, end + 1) visits end - start + 1 files.  Dividing by
        # end - start overshot 100% and raised ZeroDivisionError for a single
        # file, which then marked a correctly parsed result as bad.
        total = end - start + 1
        for count in range(start, end + 1):
            # Reset per file so that a PDF without a header row is reported
            # as bad instead of being filed under the previous student.
            college = branch = register = name = None
            try:
                pages = ["1"]
                filename = "result" + str(count) + ".pdf"
                filepath = os.path.join(result_pdf_path, filename)
                with open(filepath, "rb") as f:
                    PdfFileReader(f)          # Checking if valid pdf file
                cells = [pdf.process_page(filepath, p)
                         for p in pages]
                cells = [item for sublist in cells for item in sublist]
                li = pdf.table_to_list(cells, pages)[1]
                for i in li:
                    if 'Branch' in i[0]:
                        # Header row: one cell with the labelled fields
                        # "College : ", "Branch : ", "Name : ",
                        # "Register No : " and "Exam Name : ", in that order.
                        header = i[0]
                        collegepos = header.index('College : ')
                        branchpos = header.index('Branch : ')
                        namepos = header.index('Name : ')
                        registerpos = header.index('Register No : ')
                        exampos = header.index('Exam Name : ')
                        # The numeric offsets skip each label; strip()
                        # removes what is left of the separator.
                        college = header[collegepos:branchpos][
                            9:].strip().title()
                        branch = header[branchpos:namepos][9:].strip().title()
                        register = header[registerpos:exampos][13:].strip()
                        name = header[namepos:registerpos][7:].strip()
                        self.result_subject.setdefault(
                            college, {}).setdefault(branch, {})
                        self.result_register.setdefault(
                            college, {}).setdefault(branch, {})
                        self.registers.setdefault(
                            college, {}).setdefault(branch, [])
                        self.subjects.setdefault(branch, [])
                    elif ('Mahatma' in i[0] or 'Sl. No' in i[0]
                            or 'Semester Result' in i[1]):
                        # Title, column-heading and summary rows: no marks.
                        continue
                    else:
                        subject = i[1]
                        # '-' stands for "no mark" and counts as 0.
                        internal = 0 if i[2] == '-' else int(i[2])
                        external = 0 if i[3] == '-' else int(i[3])
                        res = i[5]
                        if register not in self.registers[college][branch]:
                            self.registers[college][branch].append(register)
                        if subject not in self.subjects[branch]:
                            self.subjects[branch].append(subject)
                        student = self.result_register[college][
                            branch].setdefault(register, {})
                        student["name"] = name
                        student[subject] = \
                            [internal, external, internal + external, res]
                        by_subject = self.result_subject[college][branch]
                        by_subject.setdefault(subject, {})[register] = \
                            [external, res]
                # Advance the progress bar by one file's share.
                current = self.parent.progressbar2.value()
                if current == -1:
                    current = 0
                self.parent.progressbar2.setValue(current + 100.0 / total)
            except Exception:
                # Missing or malformed PDF: remember the number and go on.
                self.badresult.append(count)
        self.parent.progressbar2.setValue(100)
        json1path = os.path.join(parentfolder, 'output_register.json')
        with open(json1path, 'w') as outfile:
            outfile.write(json.dumps(self.result_register, indent=4))
        json2path = os.path.join(parentfolder, 'output_subject.json')
        with open(json2path, 'w') as outfile2:
            outfile2.write(json.dumps(self.result_subject, indent=4))
        return self.badresult
Exemplo n.º 8
0
def transform(filename, console=None):
    """Extract all table rows of a PDF into one flat, cleaned-up list.

    Every page of ``filename`` is run through ``pdftable``.  Rows whose cells
    are all empty, or whose first cell is recognised by ``is_header_text``,
    are dropped.  A row whose column 3 holds a numbered list
    ("1. first 2. second") is expanded into one row per list entry, with
    column 4 split the same way.

    Args:
        filename: path of the PDF to read.
        console: passed through to ``write`` for progress output.

    Returns:
        A list of rows (lists of cell values) from all pages, in page order.
    """
    write(console, "  - Parsing...", ending='')

    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle once the page count is known (it used to stay open).
    with open(filename, 'rb') as pdf_file:
        tablePdf = pdf.PdfFileReader(pdf_file)
        pages = [str(p) for p in range(1, len(tablePdf.pages) + 1)]

    # Process the cells of each page and flatten them into one list
    cells = [pdftable.process_page(filename, p) for p in pages]
    cells = [item for sublist in cells for item in sublist]

    table = []

    write(console, "done")

    # "\r\033[K" returns to the line start and clears it so the counter is
    # redrawn in place.
    row_msg_format = "\r\033[K  - %d/%d records transformed"

    for page_table in pdftable.table_to_list(cells, pages):

        # page_table is edited in place (rows deleted and inserted), so the
        # row count is tracked by hand next to the cursor idx.
        page_table_rows = len(page_table)
        idx = 0

        while idx < page_table_rows:
            write(console, row_msg_format % (idx, page_table_rows), ending='')

            cell_len = sum(len(i) for i in page_table[idx])

            if (cell_len == 0) or is_header_text(page_table[idx][0]):
                # Drop the row; idx now already points at the next one.
                del page_table[idx]
                page_table_rows -= 1
            else:
                # Unicode for all
                for subidx in xrange(len(page_table[idx])):
                    page_table[idx][subidx] = unicode(
                        page_table[idx][subidx], encoding='utf-8').strip()

                # Cases "1. medicament one 2. medicament two"
                if re.search(r'\d\.\W\W', page_table[idx][3]):
                    re_split_cases = r'\W?\d+\.\W\W'
                    # [1:] discards whatever precedes the first "N. " marker.
                    splitted_products = re.split(re_split_cases,
                                                 page_table[idx][3])[1:]
                    splitted_vendors = re.split(re_split_cases,
                                                page_table[idx][4])[1:]
                    added_products = len(splitted_products)

                    # No numbered vendors: every product gets the same vendor.
                    # NOTE(review): a non-empty vendor list shorter than the
                    # product list raises IndexError below.
                    if not splitted_vendors:
                        splitted_vendors = [page_table[idx][4]
                                            ] * added_products

                    for i in xrange(0, added_products):
                        if i > 0:
                            # Each further product gets its own copy of the
                            # row, inserted right after the current one.
                            row_copy = list(page_table[idx])
                            idx += 1
                            page_table_rows += 1

                            page_table.insert(idx, row_copy)

                        # NOTE(review): these two cells are re-encoded to
                        # UTF-8 byte strings while the other cells stay
                        # unicode - confirm callers expect the mix.
                        page_table[idx][3] = unicode(
                            splitted_products[i]).encode('utf-8')
                        page_table[idx][4] = unicode(
                            splitted_vendors[i]).encode('utf-8')

                idx += 1

        table += page_table

        write(console, row_msg_format % (idx, page_table_rows), ending='')

    write(console, '')

    return table
Exemplo n.º 9
0
def proc(input, output):
    """Write table 1 of page 1 of the PDF ``input`` to the CSV file ``output``.

    Nothing is done when ``output`` already exists and is strictly newer than
    ``input``.

    Args:
        input: path of the source PDF.
        output: path of the CSV file to (re)create.
    """
    if os.path.exists(output) and os.stat(output).st_mtime > os.stat(input).st_mtime:
        return
    # Extract before opening the output, so a failed extraction does not
    # truncate an existing CSV.
    rs = pte.table_to_list(pte.process_page(input, "1"), 1)
    # The with-block flushes and closes the file; the handle used to be left
    # open for the garbage collector to deal with.
    with open(output, "w") as out_file:
        csv.writer(out_file).writerows(rs[1])