예제 #1
0
def txtToCSV(pdf, omit_pages):
    """Write the sentences of each PDF's text to an .xlsx workbook, one per row.

    For every entry of ``pdf`` the pages returned by ``layout.get_pages`` are
    concatenated (skipping the pages listed for that entry in ``omit_pages``),
    split into sentences with ``sent_tokenize`` and saved to a workbook whose
    path is the input path with its extension replaced by ``.xlsx``.

    Args:
        pdf: Iterable of PDF file paths (strings).
        omit_pages: Indexable parallel to ``pdf``; ``omit_pages[i]`` is a
            container of 1-based page numbers to leave out of ``pdf[i]``.

    Returns:
        None. The only effect is the workbooks written to disk.
    """
    import os  # local import: only needed to derive the output path

    for i, item in enumerate(pdf):
        # assumes layout.get_pages yields one text string per page, since the
        # pages are joined with ''.join below -- TODO confirm against layout
        pages = layout.get_pages(item)
        # omit_pages holds 1-based page numbers, so count pages from 1.
        kept = [page for number, page in enumerate(pages, 1)
                if number not in omit_pages[i]]
        sents = sent_tokenize(''.join(kept))

        wb = Workbook()
        ew = ExcelWriter(workbook=wb)
        # splitext drops only the final extension; splitting on the first '.'
        # mangled paths such as './docs/a.pdf' or 'report.v2.pdf'.
        dest_filename = os.path.splitext(item)[0] + '.xlsx'
        ws = wb.worksheets[0]
        for index, line in enumerate(sents):
            # NOTE(review): row/column 0 is only valid on openpyxl releases
            # with 0-based cell indexing; newer releases are 1-based and
            # reject 0 -- confirm the installed version before changing this.
            ws.cell(row=index, column=0).value = line
        ew.save(filename=dest_filename)
예제 #2
0
import layout
from nltk.tokenize import sent_tokenize
# Demo: collect every page returned for one PDF into a list and dump it.
pages = layout.get_pages('Coffee Bean International.pdf')
text = list(pages)
# Parenthesised single-argument print behaves the same as the Python 2 print
# statement and is also valid Python 3.
print(type(text))
print(text)