def scrapePDFs(fullnames, profs, facconsulted):
    """Report which professors are mentioned in each candidate's PDF.

    For every name in ``fullnames`` the matching file under
    ``Candidates/*.pdf`` is located (file names are expected to start with
    ``<first>_<second>_``, matched against the name with whitespace and
    apostrophes removed), its text is extracted, the corresponding
    ``facconsulted`` entry is appended, and every professor whose normalised
    name occurs as a whole phrase is recorded.

    Args:
        fullnames: iterable of candidate names, one per row.
        profs: sequence of professor names to search for.
        facconsulted: array-like supporting ``astype(str)``, boolean-mask
            assignment and integer indexing; "nan" entries are treated as
            empty.

    Returns:
        numpy object array, aligned with ``fullnames``, holding the matched
        professor names joined with ", ".

    Raises:
        AssertionError: if any requested candidate has no matching PDF.
    """
    files = glob.glob("Candidates/*.pdf")
    havefiles = [",".join(os.path.split(f)[1].split("_")[:2]) for f in files]
    needfiles = [re.sub(r"[\s']", "", n) for n in fullnames]

    missing = set(needfiles) - set(havefiles)
    if missing:
        # Raised explicitly (not via ``assert``) so the check survives -O.
        raise AssertionError("Missing files for: {}".format(missing))

    # Map each key to the first file carrying it, then order files like
    # fullnames. dtype=int keeps the empty case indexable.
    first_index = {}
    for idx, key in enumerate(havefiles):
        first_index.setdefault(key, idx)
    inds = np.array([first_index[n] for n in needfiles], dtype=int)
    files = np.array(files)[inds]

    out = np.zeros(files.shape, dtype=object)

    # Punctuation, the typographic apostrophe and newlines all become spaces
    # so names can be matched as space-delimited phrases.
    trans = str.maketrans(string.punctuation + "’\n",
                          " " * (len(string.punctuation) + 2))
    profstrans = [" {} ".format(prof.lower().translate(trans))
                  for prof in profs]

    facconsulted = facconsulted.astype(str)
    facconsulted[facconsulted == "nan"] = ""

    # Perform layout analysis for all text
    laparams = pdfminer.layout.LAParams()
    laparams.all_texts = True

    for ii, fname in enumerate(files):
        print("%d/%d: %s" % (ii, len(files), fname))

        rsrcmgr = PDFResourceManager()
        outfp = io.StringIO("")
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            with open(fname, "rb") as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, check_extractable=True):
                    try:
                        interpreter.process_page(page)
                    except Exception:
                        # Best effort: skip pages pdfminer cannot parse.
                        print("Unparsable page encountered.")
            txt = outfp.getvalue()
        finally:
            device.close()
            outfp.close()

        # Normalise once per document. The leading space lets a name at the
        # very start of the text match the space-padded needles.
        haystack = " " + (txt + " " + facconsulted[ii] + " ").lower().translate(trans)
        out[ii] = ", ".join(
            prof for prof, needle in zip(profs, profstrans)
            if needle in haystack)

    return out
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name and the
            rest are getopt-style options followed by input file names.

    Returns:
        100 after printing the usage message (bad option, no input file, or
        unknown output type); otherwise None.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is recorded and applied after the loop so that a later layout
    # flag (e.g. "-n -A") cannot hit ``None.all_texts``.
    use_layout = True

    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)

    # Without an explicit -t, infer the type from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output so a bad -t does not truncate it.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
import re
import numpy as np
from bs4 import BeautifulSoup
import bs4 as bs
import urllib.request
from delver import Crawler

# Module-level crawler instance and the working directory at import time.
# NOTE(review): `os` and the pdfminer names used below are not imported in
# the visible part of this file - confirm they are brought in elsewhere.
C = Crawler()
CWD = os.getcwd()
from io import StringIO
import io

# Shared pdfminer objects created once at import time: a TextConverter that
# writes into the in-memory `retstr` buffer, and an interpreter that drives it.
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
codec = 'utf-8'
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)


def convert_pdf_to_txt(content):
    """Begin parsing PDF bytes into a pdfminer document.

    `content` is either an object with a `.content` attribute (tried first)
    or something `io.BytesIO` accepts directly.
    NOTE(review): the definition is truncated in this view; only the setup
    steps below are visible.
    """
    # Try the `.content` attribute first; on any failure fall back to
    # treating `content` itself as the raw data.
    try:
        pdf = io.BytesIO(content.content)
    except:
        pdf = io.BytesIO(content)
    parser = PDFParser(pdf)
    document = PDFDocument(parser, password=None)  # this fails
    write_text = ''
def end_page(self, i):
    """Write the current page's characters to `self.outfp`, one line per row.

    Characters are bucketed into rows by the integer part of the negated
    fourth bbox value, and ordered inside a row by the third bbox value.
    NOTE(review): looks like a converter-class method (the script below
    instantiates `CsvConverter`); the class header is not visible here.
    """
    from collections import defaultdict
    # row key -> {x position -> encoded character text}
    lines = defaultdict(lambda : {})
    for child in self.cur_item._objs: #<-- changed
        if isinstance(child, LTChar):
            # Only the last two bbox components are used.
            (_,_,x,y) = child.bbox
            # Negating y makes an ascending sort of the keys visit larger y
            # values first.
            line = lines[int(-y)]
            line[x] = child._text.encode(self.codec) #<-- changed
    for y in sorted(lines.keys()):
        line = lines[y]
        # Characters are emitted in ascending x order within the row.
        self.outfp.write("".join(line[x] for x in sorted(line.keys())))
        self.outfp.write("\n")


# Script section: wire a parser and document to a CsvConverter that writes
# into an in-memory buffer, then walk the pages.
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
doc = PDFDocument()
fp = open(case.pdf_name, 'rb')
parser = PDFParser(fp)
# Parser and document are linked to each other before initialisation.
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize('')
interpreter = PDFPageInterpreter(rsrc, device)
for i, page in enumerate(doc.get_pages()):
    # Page marker written ahead of each page's content.
    # NOTE(review): the loop body appears truncated in this view.
    outfp.write("START PAGE %d\n" % i)
def pdf_to_txt(pdf_name, sensitivity=3, header=750, footer=80):
    """Extract a PDF's body text, line by line, into a sibling ``.txt`` file.

    Horizontal text boxes are split into individual lines, boxes whose top
    edges lie within ``sensitivity`` of each other are merged into one
    output line (ordered left to right), and lines in the header/footer
    bands are dropped. The result is written to ``pdf_name`` with ``.pdf``
    replaced by ``.txt``.

    Args:
        pdf_name: path of the PDF file to read.
        sensitivity: maximum vertical distance for two lines to count as the
            same line.
        header: lines at or above this height are cut off.
        footer: lines at or below this height are cut off.

    Returns:
        None. The output is the written text file.
    """

    def order_pdf_textboxes(pdf_data, sensitivity, header, footer):
        """Turn one page's ``[x1, y1, x2, y2, text]`` rows into ordered lines."""
        if not pdf_data:
            # A page without text boxes contributes nothing.
            return []
        df = pd.DataFrame(pdf_data)
        df.columns = ['x1', 'y1', 'x2', 'y2', 'Text']
        for column in ('x1', 'x2', 'y1', 'y2'):
            df[column] = pd.to_numeric(df[column])
        df = splitDataFrameList(df, 'Text', '\n')
        if df.empty:
            # No box contained a newline-terminated line.
            return []
        df = df.sort_values(['y2_new'], ascending=False).reset_index(drop=True)
        # Unique negative ids mark rows that are not (yet) part of a group.
        df.insert(0, 'Group', range(-1, -1 - len(df), -1))

        # Vertically adjacent rows within `sensitivity` share a group id. The
        # id is carried forward so runs of three or more stay together.
        group_id = 0
        for index in range(len(df) - 1):
            gap = abs(df.at[index, 'y2_new'] - df.at[index + 1, 'y2_new'])
            if gap < sensitivity:
                if df.at[index, 'Group'] < 0:
                    group_id += 1
                    df.at[index, 'Group'] = group_id
                df.at[index + 1, 'Group'] = df.at[index, 'Group']

        # Sort by x so text inside a group is joined left to right.
        df = df.sort_values(['x1'], ascending=True).reset_index(drop=True)
        df1 = df.groupby('Group', as_index=False).agg({
            'y2_new': 'first',
            'x1': 'first'
        })
        df = df.groupby([
            'Group'
        ])['Text'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
        df['y2_new'] = df1['y2_new']
        df = df.sort_values(['y2_new'], ascending=False)
        df = df[df.y2_new > footer]
        df = df[df.y2_new < header]
        return df['Text'].tolist()

    def splitDataFrameList(df, target_column, separator):
        """Explode multi-line text boxes into one row per line."""

        def splitListToRows(row, row_accumulator, target_column, separator):
            split_row = row[target_column].split(separator)
            # Drop whatever follows the last separator (empty when the text
            # ends with a newline).
            del split_row[-1]
            for i, s in enumerate(split_row):
                new_row = row.to_dict()
                new_row[target_column] = s
                # Spread the box height evenly over its lines.
                line_height = (new_row['y2'] - new_row['y1']) / (len(split_row))
                new_row['y2_new'] = new_row['y2'] - (i * line_height)
                new_row['y1_new'] = new_row['y2'] - ((i + 1) * line_height)
                row_accumulator.append(new_row)

        new_rows = []
        df.apply(splitListToRows,
                 axis=1,
                 args=(new_rows, target_column, separator))
        return pd.DataFrame(new_rows)

    def extract_from_element(x):
        """Return ``[x1, y1, x2, y2, text]`` for a layout element."""
        text = x.get_text()
        text = re.sub('"', "'", str(text))
        reps = (("\u201c", '"'), ("\u201d", '"'), ("\u2013", '-'),
                ("\u2019", "'"), ("\uf06c", '-'), ("\u2122", '(TM)'),
                ("\uf0b7", '-'), ("\u01b7", '3'), ("\u0e00", ' '),
                ("(cid:149)", 'x'))
        for old, new in reps:
            text = text.replace(old, new)
        # The coordinates are taken from the element's string form: its
        # second space-separated field, split on commas.
        dims = str(x).split(' ')[1].split(',')
        return dims + [text]

    def list_to_txt(lists, fname):
        """Write one item per line to the ``.txt`` sibling of ``fname``."""
        with open(fname.replace(".pdf", ".txt"), 'w') as thefile:
            for item in lists:
                # str(bytes)[2:-1] keeps the escaped form of non-ASCII
                # characters, exactly as the output has always been written.
                item = str(item.encode("utf-8"))[2:-1]
                thefile.write("%s\n" % item)

    # PDF extract code
    pdf_full = []
    with open(pdf_name, 'rb') as document:
        # Create resource manager
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Loop through the pages
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            # Extract only the text objects
            pdf_data = [
                extract_from_element(element) for element in layout
                if "LTTextBoxHorizontal" in str(element)
            ]
            pdf_full = pdf_full + order_pdf_textboxes(
                pdf_data, sensitivity, header, footer)
    list_to_txt(pdf_full, pdf_name)
def _iter_pdf_page_texts(stream):
    """Yield the extracted text of each page of an open binary PDF stream.

    Iteration stops silently if pdfminer reports a ``PDFSyntaxError``.
    """
    try:
        for page in PDFPage.get_pages(
                stream, caching=True, check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(
                resource_manager,
                fake_file_handle,
                codec='utf-8',
                laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(
                    resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Close before yielding so nothing is left open if the
                # consumer stops early or a page fails.
                converter.close()
                fake_file_handle.close()
            yield text
    except PDFSyntaxError:
        return


def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to a local PDF file, or an ``io.BytesIO`` holding
        the bytes of a remote PDF
    :return: iterator of strings, one per page of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    if isinstance(pdf_path, io.BytesIO):
        # remote pdf already loaded into memory
        yield from _iter_pdf_page_texts(pdf_path)
    else:
        # local pdf file
        with open(pdf_path, 'rb') as fh:
            yield from _iter_pdf_page_texts(fh)
def _decodepdf(filename, debug=False):
    """Convert the census PDF to text and normalise it for parsing.

    Args:
        filename: path of the PDF to convert (ignored when ``debug`` is set).
        debug: when true, skip the conversion and read the previously saved
            ``rawfile.txt`` instead.

    Returns:
        Tuple ``(pdfstr, True)`` where ``pdfstr`` is the cleaned text.

    Side effects:
        In non-debug mode the raw converted text is written to
        ``rawfile.txt`` in the current directory.
    """
    pdfstr = ""
    if debug:
        report("Debug mode enabled, reading inmates from rawfile.txt ...")
        with open("rawfile.txt", "r") as rawfile:
            pdfstr = rawfile.read()
        report("... Done reading inmates from disk.")
    else:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec,
                               laparams=laparams)
        report("Converting PDF to text ...")
        try:
            # open() instead of the Python 2-only file() builtin; the with
            # block closes the handle even if conversion fails.
            with open(filename, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
            pdfstr = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
        report("... Done converting PDF to text.")
        report("Saving converted PDF text to rawfile.txt ...")
        with open("rawfile.txt", "w") as rawfile:
            rawfile.write(pdfstr)
        report("... Done saving converted PDF text.")

    report("Preprocessing PDF Text ...")

    # Put every field label at the start of its own line.
    labels = [
        'Book Dt:', 'Book Typ:', 'Cus Typ:', 'Bail:', 'Bond:', 'Court:',
        'Judge:', 'Exp Rls:', 'Arr Agy:', 'Arr Typ:', 'ROC:', 'Chg:',
        'Indict:', 'Adj Dt:', 'Term:'
    ]
    for label in labels:
        pdfstr = pdfstr.replace(label, "\n{0} ".format(label))

    # preprocess to get rid of duplicate spaces and \n's
    pdfstr = re.sub(r' +', ' ', pdfstr)
    pdfstr = re.sub(r'\n+', '\n', pdfstr)

    # handle current sentence going to next line
    # (time from custody date/time + first three capital letters from
    # sentence type)
    pdfstr = re.sub(r'([0-9]{4})(?: )?\n([A-Z]{3})', r'\1 \2', pdfstr)

    # handle inmate ID not being on same line as inmate data
    # (in some cases mcid, sex, race, and dob can all be on different lines)
    pdfstr = re.sub(
        r'([0-9]{6})(?: )?(?:\n)?([A-Z])(?: )?(?:\n)?([A-Z])(?: )?(?:\n)?([0-9]{2}-[0-9]{2}-[0-9]{2})',
        r'\1 \2 \3 \4', pdfstr)

    # Page furniture to delete, applied in this order.
    removals = (
        # page header
        r'Current Census for Date: [0-9]{2}-[0-9]{2}-[0-9]{4}(?: )?(?:\n)?',
        r'Name(?: )?(?:\n)?Location(?: )?(?:\n)?',
        r'MCJ Sex Rce DOB(?: )?(?:\n)?',
        r'Custody(?: )?(?:\n)?Time(?: )?(?:\n)?Date(?: )?(?:\n)?Classification(?: )?(?:\n)?',
        r'Min(?: )?(?:\n)?Rel(?: )?(?:\n)?Date(?: )?(?:\n)?',
        # page footers
        r'Facility:(?: )?(?:\n)?',
        r'Page \d+ of \d+(?: )?(?:\n)?',
        r'Printed:(?: )?\n([0-9]{2}-[0-9]{2}-[0-9]{4}) [0-9]{4}',
        # page-to-page formatting issue (form feed); probably not necessary
        r'\x0C(?: )?(?:\n)?',
    )
    for pattern in removals:
        pdfstr = re.sub(pattern, '', pdfstr)

    # post-process to get rid of duplicate spaces and \n's
    pdfstr = re.sub(r' +', ' ', pdfstr)
    pdfstr = re.sub(r'\n+', '\n', pdfstr)

    report("... Done Preprocessing PDF Text.")
    return pdfstr, True
def get_text_from_pdf(pdfname, limit=1000):
    """Extract a PDF's text and regroup it into chunks of running prose.

    Lines are cleaned (NUL characters removed, runs of spaces collapsed,
    whitespace trimmed). Blank lines, numeric-only lines and single-word
    lines not ending in a period are skipped. The remaining lines are joined
    into chunks: a new chunk starts at a paragraph/sentence boundary once the
    current chunk is longer than ``limit`` characters.

    Args:
        pdfname: path of the PDF file; an empty string means "no file".
        limit: chunk length (in characters) after which the next boundary
            starts a new chunk.

    Returns:
        A list of text chunks, or the empty string ``''`` when ``pdfname``
        is empty or the file cannot be opened.
    """
    # No file name given: return an empty string.
    if pdfname == '':
        return ''
    # Open the PDF; give up quietly if it cannot be opened.
    try:
        fp = open(pdfname, 'rb')
    except (OSError, TypeError, ValueError):
        return ''

    # Extract the text; every handle is released even if parsing fails.
    with fp:
        rsrcmgr = PDFResourceManager()
        out_fp = StringIO()
        la_params = LAParams()
        la_params.detect_vertical = True
        device = TextConverter(rsrcmgr, out_fp, codec='utf-8',
                               laparams=la_params)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                          password=None, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = out_fp.getvalue()
        finally:
            device.close()
            out_fp.close()

    outputs = []
    output = ""
    is_blank_line = False

    for line in text.splitlines():
        # Remove NUL characters, collapse repeated spaces, trim the ends.
        line = line.replace("\x00", "")
        line = re.sub("[ ]+", " ", line).strip()

        # Blank lines only mark a paragraph boundary.
        if not line:
            is_blank_line = True
            continue

        # Skip lines that contain only a number.
        if is_float(line):
            continue

        # Skip single-word lines that do not end with a period. This must
        # count the words; comparing the bound ``.count`` method to 1 is
        # always False.
        if len(line.split(" ")) == 1 and not line.endswith("."):
            continue

        if is_blank_line or output.endswith("."):
            # Sentence/paragraph boundary: cut here once past the limit.
            if len(output) > limit:
                outputs.append(output)
                output = ""
            else:
                output += "\r\n"
        elif output.endswith("-"):
            # Word hyphenated across lines: drop the hyphen and join.
            output = output[:-1]
        else:
            # Plain continuation: separate words with a single space.
            output += " "

        output += line
        is_blank_line = False

    outputs.append(output)
    return outputs
def parse(fileName):
    """Build a PDF of stamped production-order pages from an uploaded PDF.

    Reads `upload_path + fileName + ".pdf"`, extracts the text of each page,
    and keeps only pages whose department code (from `bgGetProductDp`) is
    'CHE'. For each kept page an `HmProduct` is filled in by the project
    helpers, and the page is merged with a code page from `hmCreateQRCode`
    ('E' and/or 'H' variants, depending on `clothTubeEye` / `clothTubeHook`).
    The collected pages are written under `download_path`.

    Returns the generated file name (timestamp + "_" + the sales-order number
    of the first kept page + ".pdf").
    Raises PDFTextExtractionNotAllowed if the document is not extractable.
    NOTE(review): the block structure was reconstructed from a
    whitespace-mangled source - confirm nesting against the original file.
    """
    text_path = upload_path + fileName + ".pdf"
    hmPdfSaveName = ""
    fileOpen = open(text_path, 'rb')
    doc = PDFDocument()
    parser = PDFParser(fileOpen)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    # Source file (two readers over the same handle, one per output variant)
    hmPdfReaderEye = PyPDF2.PdfFileReader(fileOpen)
    hmPdfReaderHook = PyPDF2.PdfFileReader(fileOpen)
    # File that the output pages are written to
    hmPdfWriter = PyPDF2.PdfFileWriter()
    # Check whether the document allows text extraction
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create the PDF resource manager for shared resources
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Temporary code-page files, closed and deleted after the save
        openFileArr = []
        allPages = doc.get_pages()
        for page in allPages:
            interpreter.process_page(page)
            layout = device.get_result()
            # Concatenate the text of every horizontal text box on the page
            textValueArr = []
            for x in layout:
                if (isinstance(x, LTTextBoxHorizontal)):
                    textValueArr.append(x.get_text())
            pdfTxt = ''.join(textValueArr)
            textValueArr.clear()
            dpText = bgGetProductDp(pdfTxt)
            if dpText != 'CHE':
                # Not a production order for this department; skip the page
                continue
            # init
            hmPd = HmProduct()
            # Production department
            hmPd.productDp = dpText
            # Production order number
            hmPd.productCasNum = bgGetProductInvoicesNum(pdfTxt)
            # Product number
            hmPd.productNum = bgGetProductNumber(pdfTxt, hmPd)
            # Hook-and-eye specification / rows / B count / size
            hmPd.productSf = bgGetProductSpecification(pdfTxt, hmPd)
            # Sales order number
            hmPd.productSealNum = bgGetProductSealNum(pdfTxt)
            # Order quantity / unit
            hmPd.productCount = getHmProductCount(pdfTxt, hmPd)
            # Customer number
            hmPd.productGuest = bgGetProductGuest(pdfTxt)
            # Product batch
            hmPd.productBatch = bgGetProductBatch(pdfTxt)
            # Chinese description / colour remark / supplementary notes /
            # cloth tube / cutting method
            hmPd.productRamk = bgGetProductDetilRamk(pdfTxt, hmPd)
            # Generate the production order uuid
            hmPd.hm_pd_uuid = bgGetPageMd5(hmPd)
            # Handle special orders
            getHmHookEyeIsSpecial(hmPd)
            # Product row and piece counts
            hmPd.productEPill = countHmPillx('E', hmPd)
            hmPd.productHPill = countHmPillx('H', hmPd)
            hmPd.productEYard = countHmYard(hmPd.productEPill, hmPd)
            hmPd.productHYard = countHmYard(hmPd.productHPill, hmPd)
            # Product hook/eye weight in kilograms
            hmPd.productEyeKG = bgGetHookEyeKg('E', hmPd)
            hmPd.productHookKG = bgGetHookEyeKg('H', hmPd)
            # Update the production order date from the master table
            updateOutDate(hmPd, pdfTxt)
            # ---------------- 1. generate file: START ----------------#
            payStr = ''
            # layout.pageid is offset by one to index the PyPDF2 reader
            layoutPageId = layout.pageid - 1
            # Generate the "eye" sheet
            if hmPd.clothTubeEye > 0:
                eyeNewPage = hmPdfReaderEye.getPage(layoutPageId)
                ePatch = hmCreateQRCode(hmPd, 'E', payStr)
                eMarkFile = open(ePatch, 'rb')
                pdfECodePage = PyPDF2.PdfFileReader(eMarkFile)
                eyeNewPage.mergePage(pdfECodePage.getPage(0))
                hmPdfWriter.addPage(eyeNewPage)
                # Kept open until the writer has saved the output
                openFileArr.append(eMarkFile)
                del eyeNewPage
                del pdfECodePage
                gc.collect()
            # Generate the "hook" sheet
            if hmPd.clothTubeHook > 0:
                hookNewPage = hmPdfReaderHook.getPage(layoutPageId)
                hPatch = hmCreateQRCode(hmPd, 'H', payStr)
                hMarkFile = open(hPatch, 'rb')
                pdfHCodePage = PyPDF2.PdfFileReader(hMarkFile)
                hookNewPage.mergePage(pdfHCodePage.getPage(0))
                hmPdfWriter.addPage(hookNewPage)
                openFileArr.append(hMarkFile)
                del hookNewPage
                del pdfHCodePage
                gc.collect()
            # Use the sales order number (first kept page) as the file name
            if hmPdfSaveName == "":
                hmPdfSaveName = hmPd.productSealNum
            # ---------------- 1. generate file: END ----------------#
        # On completion, save the output and close the files
        # ---------------- close files once generated ----------------#
        nowTime = datetime.datetime.now()
        # NOTE(review): "_s" is emitted literally by strftime - confirm it is
        # intended rather than a format directive.
        nowTimeStr = nowTime.strftime("%Y%m%d%H%M%S_s")
        hmPdfSaveName = nowTimeStr + "_" + hmPdfSaveName + ".pdf"
        hmPdfSavePath = download_path + hmPdfSaveName
        resultPdfFile = open(hmPdfSavePath, 'wb')
        hmPdfWriter.write(resultPdfFile)
        # Close and delete every temporary code-page file
        for closeItem in openFileArr:
            closeItem.close()
            os.remove(closeItem.name)
        openFileArr.clear()
        resultPdfFile.close()
        fileOpen.close()
        return hmPdfSaveName
def ConvertPdf(pdfpath, outfp, opts=()):
    """Convert one PDF file and write the result to an open output stream.

    Args:
        pdfpath: path of the PDF file to read.
        outfp: writable file-like object that receives the converted output.
        opts: iterable of ``(flag, value)`` pairs in pdf2txt style (for
            example ``[('-t', 'html'), ('-p', '1,2')]``); a mapping of flag
            to value is accepted as well.

    Returns:
        True once the document has been converted.

    Raises:
        ValueError: if the requested output type (``-t``) is not one of
            txt/text, xml, html or tag.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is applied after the loop so a later layout flag cannot hit None.
    use_layout = True

    option_pairs = opts.items() if hasattr(opts, 'items') else opts
    for (k, v) in option_pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            pass  # accepted for compatibility; output always goes to outfp
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
    if outtype in ('txt', 'text'):
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        raise ValueError('unknown output type: %r' % (outtype,))

    try:
        # open() instead of the Python 2-only file() builtin.
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
    return True
def _extract_doc_text(data):
    """Return the text of a legacy binary Word (.doc) file read from *data*.

    Follows the piece table (Clx / PlcPcd) of the Word binary format: each
    piece covers a range of character positions and points at either 8-bit
    cp1252 text or 16-bit little-endian text in the WordDocument stream.
    """
    document = olefile.OleFileIO(data)
    word_document = document.openstream('WordDocument').read()

    # Loading the FIB: location/size of the Clx and which table stream holds it.
    fib = word_document[:1472]
    fc_clx = int.from_bytes(fib[0x01A2:0x01A6], byteorder='little')
    lcb_clx = int.from_bytes(fib[0x01A6:0x01AA], byteorder='little')
    flags = int.from_bytes(fib[0x000A:0x000C], byteorder='little')
    table_name = '1Table' if flags & 0x0200 else '0Table'
    table = document.openstream(table_name).read()
    clx = table[fc_clx:fc_clx + lcb_clx]

    # Walk the Clx entries until the piece table (marker byte 2) is found.
    piece_table = b''
    pos = 0
    while pos < len(clx):
        marker = clx[pos]
        if marker == 2:
            size = int.from_bytes(clx[pos + 1:pos + 5], byteorder='little')
            piece_table = clx[pos + 5:pos + 5 + size]
            break
        if marker != 1:
            break
        # Property entry to skip: marker byte, 2-byte size, then the data.
        pos += 3 + int.from_bytes(clx[pos + 1:pos + 3], byteorder='little')

    # n pieces are stored as n + 1 character positions (4 bytes each)
    # followed by n descriptors (8 bytes each).
    piece_count = (len(piece_table) - 4) // 12
    parts = []
    for i in range(piece_count):
        cp_start = int.from_bytes(piece_table[i * 4:i * 4 + 4],
                                  byteorder='little')
        cp_end = int.from_bytes(piece_table[(i + 1) * 4:(i + 1) * 4 + 4],
                                byteorder='little')
        offset = (piece_count + 1) * 4 + i * 8
        descriptor = piece_table[offset:offset + 8]
        fc_value = int.from_bytes(descriptor[2:6], byteorder='little')
        is_ansi = (fc_value & 0x40000000) == 0x40000000
        fc = fc_value & 0x3FFFFFFF
        char_count = cp_end - cp_start
        if is_ansi:
            # Compressed pieces store twice the real offset, one byte/char.
            fc //= 2
            parts.append(word_document[fc:fc + char_count].decode('cp1252'))
        else:
            parts.append(
                word_document[fc:fc + char_count * 2].decode('utf-16-le'))
    return ''.join(parts)


def readfile(file):
    """Read a document and return its text content, best effort.

    The format is chosen from the file extension: .caj/.pdf, .doc, .docx,
    .htm/.html, .rtf and .txt. ``http://``, ``https://`` and ``ftp://``
    locations are downloaded first.

    Args:
        file: local path or URL of the document.

    Returns:
        The extracted text (the raw bytes for ``.txt``), or None if the file
        cannot be read, parsed, or has an unknown extension.
    """
    import logging

    data = None
    try:
        if file.startswith(('https://', 'http://', 'ftp://')):
            data = BytesIO(download(file))
        else:
            data = open(file, 'rb')

        if file.endswith(('.caj', '.pdf')):
            with StringIO() as outfp:
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, outfp)
                process_pdf(rsrcmgr, device, data)
                return outfp.getvalue()
        elif file.endswith('.doc'):
            # See https://msdn.microsoft.com/en-us/library/office/dd904907(v=office.14).aspx
            # And http://b2xtranslator.sourceforge.net/howtos/How_to_retrieve_text_from_a_binary_doc_file.pdf
            return _extract_doc_text(data)
        elif file.endswith('.docx'):
            text = ''
            document = Document(data)
            text += '\n\n'.join(
                [paragraph.text for paragraph in document.paragraphs])
            for table in document.tables:
                text += _parse_docx_table(table, text)
            return text
        elif file.endswith(('.htm', '.html')):
            html = html2text.HTML2Text()
            html.ignore_links = True
            return html.handle(data.read().decode('utf-8'))
        elif file.endswith('.rtf'):
            with BytesIO() as outfp:
                document = Rtf15Reader.read(data)
                return PlaintextWriter.write(document, outfp).getvalue()
        elif file.endswith('.txt'):
            return data.read()
        else:
            raise Exception('Unknown file extension')
    except Exception as err:
        # Best effort by contract: report the failure and return None.
        logging.getLogger(__name__).warning(
            "readfile: could not read %r: %s", file, err)
        return None
    finally:
        if data is not None:
            data.close()
#3、创建文件解析器 #具体的代码含义,都进行了注释 #创建一个PDF文档解析器对象 try: parser = PDFParser(fp) #创建一个PDF文档对象存储文档结构 #提供密码初始化,没有就不用传该参数 #document = PDFDocument(parser, password) document = PDFDocument(parser) #检查文件是否允许文本提取 if not document.is_extractable: raise PDFTextExtractionNotAllowed #创建一个PDF资源管理器对象来存储共享资源 #caching = False不缓存 rsrcmgr = PDFResourceManager(caching=False) # 创建一个PDF设备对象 laparams = LAParams() # 创建一个PDF页面聚合对象 device = PDFPageAggregator(rsrcmgr, laparams=laparams) #创建一个PDF解析器对象 interpreter = PDFPageInterpreter(rsrcmgr, device) #处理文档当中的每个页面 # doc.get_pages() 获取page列表 #for i, page in enumerate(document.get_pages()): #PDFPage.create_pages(document) 获取page列表的另一种方式 replace = re.compile(r'\s+') # 循环遍历列表,每次处理一个page的内容 for page in PDFPage.create_pages(document): interpreter.process_page(page)
#fp = urlopen("url") #创建一个与文档关联的解释器 parser = PDFParser(fp) #创建PDF文档对象 doc = PDFDocument() #链接解释器和文档对象 parser.set_document(doc) doc.set_parser(parser) #初始化文档 doc.initialize("") #创建PDF资源管理器 resource = PDFResourceManager() #参数分析器 laparam = LAParams() #PDF聚合器 device = PDFPageAggregator(resource, laparams=laparam) #创建PDF页面解释器 interpreter = PDFPageInterpreter(resource, device) #使用文档对象得到页面的集合 for page in doc.get_pages(): #使用页面解释器读取 interpreter.process_page(page)
def get_transaction_list(self, address):
    """Parse a statement PDF into a list of transaction dicts.

    Text items are collected through `self.fetch_chars` into
    `self.extracted_text`, regrouped into rows per page, and scanned between
    the start/end markers configured in `hdfc_parsing_spec`. A row whose
    prefix parses as a date starts a new transaction; items are assigned to
    columns by the x ranges in `cols_conf`.

    Returns a list of dicts with keys 'date', 'chqNo', 'balance',
    'narration' and 'amount' (debits negative). Running-balance mismatches
    are printed, not raised.
    NOTE(review): the block structure was reconstructed from a
    whitespace-mangled source - confirm nesting against the original file.
    """
    # Create file pointer
    fp = open(address, 'rb')
    # Create parser object to parse the pdf content
    parser = PDFParser(fp)
    # Store the parsed content in PDFDocument object
    document = PDFDocument(parser, '')
    # Create PDFResourceManager object that stores shared resources such as fonts or images
    rsrcmgr = PDFResourceManager()
    # set parameters for analysis
    laparams = LAParams()
    # Create a PDFDevice object which translates interpreted information into desired format
    # Device needs to be connected to resource manager to store shared resources
    # device = PDFDevice(rsrcmgr)
    # Extend the device to a page aggregator to get LT object elements
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create interpreter object to process page content from PDFDocument
    # Interpreter needs to be connected to resource manager for shared resources and device
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Ok now that we have everything to process a pdf document, lets process it page by page
    for page in PDFPage.create_pages(document):
        # As the interpreter processes the page stored in PDFDocument object
        interpreter.process_page(page)
        # The device renders the layout from interpreter
        layout = device.get_result()
        # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox):
                self.fetch_chars(lt_obj)
            if isinstance(lt_obj, LTTextLine):
                self.fetch_chars(lt_obj)
        self.page_num += 1
    fp.close()
    # Since, we always read form left to right, optimize data to be read from left to right.
    # This would help us find the location of different headers. It is also required to find co-ordinates
    # Regroup the items as page number -> bbox[1] -> list of items.
    remapped = defaultdict(lambda: defaultdict(list))
    for char_text in self.extracted_text:
        remapped[char_text['page_num']][char_text['bbox'][1]].append(char_text)
    overall_end_marker = hdfc_parsing_spec['transactions']['overall_end_row']
    transactions = []
    # Placeholder until the first dated row is seen; its fields are strings,
    # whereas real transactions accumulate lists of items.
    current_transaction = {
        'txn_date': '',
        'txn_msg': '',
        'debit_amount': '',
        'credit_amount': '',
        'running_balance': ''
    }
    for page_num in range(1, len(remapped) + 1):
        # Rows of this page, ordered by descending bbox[1].
        dataset = []
        for key, val in remapped[page_num].items():
            dataset.append((key, val))
        dataset = sorted(dataset, key=lambda x: -x[0])
        # Page-specific start/end markers, else the 'default' entry.
        if page_num in hdfc_parsing_spec['transactions']['page_conf']:
            start_marker = hdfc_parsing_spec['transactions']['page_conf'][page_num]['start_row']
            end_marker = hdfc_parsing_spec['transactions']['page_conf'][page_num]['end_row']
        else:
            start_marker = hdfc_parsing_spec['transactions']['page_conf']['default']['start_row']
            end_marker = hdfc_parsing_spec['transactions']['page_conf']['default']['end_row']
        has_started = False
        constructed_string = ''
        for row in dataset:
            # Rebuild the row text left to right; a space is inserted when an
            # item does not start where the previous one ended.
            row_sorted = sorted(row[1], key=lambda x: x['bbox'][0])
            constructed_string = ''
            prev = row_sorted[0]['bbox'][0]
            for entry in row_sorted:
                # print(entry, end='\n\n\n\n')
                if abs(prev - entry['bbox'][0]) < 1e-3:
                    constructed_string += entry['text']
                else:
                    constructed_string += ' ' + entry['text']
                prev = entry['bbox'][2]
            # print(constructed_string)
            # End of this page's transaction area.
            if end_marker in constructed_string:
                break
            # End of the whole statement: flush the transaction in progress.
            if constructed_string == overall_end_marker:
                for key, val in current_transaction.items():
                    if isinstance(val, list):
                        if len(val) == 0:
                            current_transaction[key] = ''
                            continue
                        # Join the collected items into one string, using the
                        # same spacing rule as for the row text.
                        part = ''
                        prev = val[0]['bbox'][0]
                        for entry in val:
                            if abs(prev - entry['bbox'][0]) < 1e-3:
                                part += entry['text']
                            else:
                                part += ' ' + entry['text']
                            prev = entry['bbox'][2]
                        current_transaction[key] = part
                transactions.append(current_transaction)
                break
            if has_started:
                # print(constructed_string)
                if len(constructed_string) >= 8:
                    try:
                        # A row that begins with a parseable date starts a
                        # new transaction.
                        curr_date = datetime.datetime.strptime(
                            constructed_string[:hdfc_parsing_spec['transactions']['date_format']['length']],
                            hdfc_parsing_spec['transactions']['date_format']['date_string']
                        )
                        # Flush the previous transaction if it has a message.
                        if current_transaction['txn_msg']:
                            for key, val in current_transaction.items():
                                if isinstance(val, list):
                                    if len(val) == 0:
                                        current_transaction[key] = ''
                                        continue
                                    part = ''
                                    prev = val[0]['bbox'][0]
                                    for entry in val:
                                        if abs(prev - entry['bbox'][0]) < 1e-3:
                                            part += entry['text']
                                        else:
                                            part += ' ' + entry['text']
                                        prev = entry['bbox'][2]
                                    current_transaction[key] = part
                            transactions.append(current_transaction)
                        current_transaction = {
                            'txn_date': curr_date,
                            'txn_msg': [],
                            'debit_amount': [],
                            'credit_amount': [],
                            'running_balance': [],
                            'cheque_no': []
                        }
                    except:
                        # Any failure in the block above (including a row that
                        # does not start with a date) is ignored; the row is
                        # treated as a continuation.
                        pass
                # Assign each item to the column whose x range contains it.
                for entry in row_sorted:
                    for key, val in hdfc_parsing_spec['transactions']['cols_conf'].items():
                        if val[0] <= entry['bbox'][0] <= val[1]:
                            current_transaction[key].append(entry)
            # Collection starts with the row after the start marker.
            if constructed_string == start_marker:
                has_started = True
        # Stop scanning further pages once the overall end row was reached.
        if constructed_string == overall_end_marker:
            break
    # Opening balance implied by the first transaction.
    # NOTE(review): raises IndexError when no transaction was parsed.
    initial_balance = (
        self.parse_float(transactions[0]['running_balance'])
        + self.parse_float(transactions[0]['debit_amount'])
        - self.parse_float(transactions[0]['credit_amount'])
    )
    # Consistency check: recompute the running balance and print mismatches.
    for txn in transactions:
        new_balance = initial_balance - self.parse_float(txn['debit_amount']) + self.parse_float(
            txn['credit_amount'])
        if abs(new_balance - self.parse_float(txn['running_balance'])) > 1E-6:
            print(new_balance)
            print(self.parse_float(txn['running_balance']))
            print('>>> Date: {:%d-%b-%Y}, Msg: {}, Debit: {}, Credit: {}, Balance: {}'.format(
                txn['txn_date'], txn['txn_msg'], txn['debit_amount'], txn['credit_amount'], txn['running_balance']
            ))
        initial_balance = new_balance
    # Convert to the output shape; debits become negative amounts.
    transaction_list = []
    for transaction in transactions:
        if transaction['credit_amount'] == '':
            amount = -1 * float(transaction['debit_amount'].replace(',', ''))
        else:
            amount = float(transaction['credit_amount'].replace(',', ''))
        transaction_list_single = {
            'date': transaction['txn_date'],
            'chqNo': '',
            'balance': float(transaction['running_balance'].replace(',', '')),
            'narration': transaction['txn_msg'],
            'amount': amount
        }
        transaction_list.append(transaction_list_single)
    print(len(transaction_list))
    return transaction_list
def parse(_path):
    """Download a PDF report and record its cargo/mail transport figures.

    The PDF at URL ``_path`` is fetched with a randomly chosen User-Agent.
    For every horizontal text box containing "运输完成情况", the 11th and
    13th lines of the box are taken as the domestic and international
    figures, printed, and appended to ``pdf_val.txt`` in the current
    directory.

    Args:
        _path: URL of the PDF document.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    from io import BytesIO

    # Pick a random element from the user_agent list.
    request = Request(url=_path,
                      headers={'User-Agent': random.choice(user_agent)})
    # The HTTP response cannot seek, which the PDF parser needs, so the body
    # is buffered in memory and the connection is closed right away.
    response = urlopen(request)
    try:
        fp = BytesIO(response.read())
    finally:
        response.close()

    # Create a PDF parser from the file object.
    praser_pdf = PDFParser(fp)
    # Create a PDF document.
    doc = PDFDocument()
    # Connect the parser and the document object.
    praser_pdf.set_document(doc)
    doc.set_parser(praser_pdf)
    # Initialise without a password.
    doc.initialize()

    # Check whether the document allows text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager for shared resources, layout parameters, aggregator
    # and page interpreter.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(r'pdf_val.txt', 'a') as f:
        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # layout is the LTPage holding the objects parsed from the page.
            layout = device.get_result()
            for out in layout:
                if not isinstance(out, LTTextBoxHorizontal):
                    continue
                results = out.get_text()
                if "运输完成情况" not in results:
                    continue
                target_value = results.split("\n")
                if len(target_value) <= 12:
                    # The box lacks the expected lines; nothing to record.
                    continue
                inland_amount = target_value[10]
                foreign_amount = target_value[12]
                print("国内货邮运输量:", inland_amount, "国际货邮运输量:",
                      foreign_amount)
                f.write("国内货邮运输量:" + inland_amount + ",国际货邮运输量:" +
                        foreign_amount + "\n")
def text_from_pdf(pdf_path, authors):
    """Extract cleaned text and per-author affiliation guesses from a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        authors: Iterable of author records; element ``[1]`` of each record
            is passed to ``get_last_name``.

    Returns:
        A tuple ``(extracted_text, affiliations)``. ``extracted_text`` is the
        concatenation of every text box / text line, stripped of characters
        other than letters, digits, whitespace, ``.`` and ``@``, one block
        per line. ``affiliations`` maps a last name to the
        whitespace-normalised text recorded for it the first time the name
        is seen in a block.

    Side effects:
        Removes ``working/temp`` if it exists and writes the extracted text
        to the module-level ``temp_path`` (UTF-8).
    """
    affiliations = dict()
    author_lastnames = set()
    alternatives = []
    for author in authors:
        lastname = get_last_name(author[1])
        print("Lastname:", lastname)
        author_lastnames.add(lastname)
        # NOTE(review): last names are inserted into the regex unescaped,
        # exactly as before; names with regex metacharacters would be
        # interpreted as patterns.
        alternatives.append(lastname + r"\s*,?\s*")
    author_lastname_pattern = "(" + "|".join(alternatives) + ")"
    print("Author lastname pattern:", author_lastname_pattern)

    text_blocks = []
    # The parser reads from the file while pages are processed, so the file
    # stays open for the whole page loop.
    with open(pdf_path, "rb") as infp:
        parser = PDFParser(infp)
        # Document object that stores the document structure.
        document = PDFDocument(parser)
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # Receive the layout object for the page.
                layout = device.get_result()
                for lt_obj in layout:
                    if not (isinstance(lt_obj, LTTextBox)
                            or isinstance(lt_obj, LTTextLine)):
                        continue
                    tmp_text = re.sub(r"[^A-Za-z0-9 \s\.@]", "",
                                      lt_obj.get_text())
                    # Number of distinct last names found in this block.
                    author_count = sum(
                        1 for lastname in author_lastnames
                        if re.search(lastname, tmp_text))
                    if author_count > 0:
                        print("Author text block: ", tmp_text)
                        tmp_pattern = (author_lastname_pattern + "{" +
                                       str(author_count) + "}(.*)$")
                        affiliation_block = re.search(tmp_pattern, tmp_text,
                                                      re.DOTALL)
                        if affiliation_block:
                            print("Groups: ", affiliation_block.groups())
                            for lastname in author_lastnames:
                                if lastname in affiliations:
                                    continue
                                if not re.search(lastname, tmp_text):
                                    continue
                                # The pattern has two groups, so the trailing
                                # group is used only for a single-author
                                # block; otherwise the whole block is kept.
                                if (len(affiliation_block.groups()) >
                                        author_count):
                                    raw = affiliation_block.group(
                                        author_count + 1)
                                else:
                                    raw = tmp_text
                                affiliations[lastname] = re.sub(
                                    r"\s+", " ", raw)
                    text_blocks.append(tmp_text + "\n")
        finally:
            device.close()
    extracted_text = "".join(text_blocks)

    if os.path.exists("working/temp"):
        os.remove("working/temp")
    with open(temp_path, "w", encoding="utf-8") as outfp:
        outfp.write(extracted_text)
    return (extracted_text, affiliations)
def convert_to_txt(self, data):
    """Convert the document at path ``data`` to a ``.txt`` file beside it.

    The converter is chosen from the lower-cased file extension: ``.pdf``,
    ``.docx``, ``.doc``, ``.xlsx`` or ``.xls``.

    Returns:
        For ``.pdf`` the bare file name of the text file (no directory); for
        the other supported types the full path of the text file. ``None``
        for any other extension.
    """
    extension = os.path.splitext(data)[1].lower()
    if extension == '.pdf':
        fname = data
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr,
                               retstr,
                               codec=codec,
                               laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # Process each page contained in the document; the input handle
            # is released as soon as the last page has been read.
            with open(fname, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
            data = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
        with open(os.path.splitext(fname)[0] + '.txt', 'wb') as output_file:
            output_file.write(data)
        return os.path.splitext(os.path.split(fname)[1])[0] + '.txt'
    elif extension == '.docx':
        document = docx.Document(data)
        text = [paragraph.text for paragraph in document.paragraphs]
        text.extend([
            cell.text for table in document.tables for row in table.rows
            for cell in row.cells
        ])
        # NOTE(review): this joins individual characters (printable ASCII
        # only) with newlines, not paragraphs; kept as is, confirm intent.
        with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
            output_file.write('\n'.join(
                [c for x in text for c in x if 32 <= ord(c) <= 127]))
        return os.path.splitext(data)[0] + '.txt'
    elif extension == '.doc':
        with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
            output_file.write("\n".join(
                [x for x in doc2text(data) if 32 <= ord(x) <= 127]))
        return os.path.splitext(data)[0] + '.txt'
    elif extension == '.xlsx':
        # openpyxl.Workbook() creates a new empty workbook; load_workbook()
        # is what actually reads the file from disk.
        wb = openpyxl.load_workbook(data)
        values = [
            cell.value for ws in wb.worksheets for row in ws.iter_rows()
            for cell in row if cell.value is not None
        ]
        # One line per non-empty cell, reduced to printable ASCII like the
        # other branches.
        lines = [
            "".join(c for c in u"{}".format(value) if 32 <= ord(c) <= 127)
            for value in values
        ]
        with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
            output_file.write("\n".join(lines).encode('ascii'))
        return os.path.splitext(data)[0] + '.txt'
    elif extension == '.xls':
        wb = xlrd.open_workbook(data)
        text = [
            cell.value for ws in wb.sheets() for row in ws.get_rows()
            for cell in row
        ]
        with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
            output_file.write("\n".join([str(x) for x in text]))
        return os.path.splitext(data)[0] + '.txt'
def get_paper_info_from_pdf(data):
    """Collect metadata and the full text of a PDF given as a byte string.

    Args:
        data: Raw bytes of the PDF document.

    Returns:
        A dict that always contains ``'extracted_text'`` and, when they can
        be determined, ``'authors'`` (list of strings), ``'title'`` and
        ``'doi'``. The DOI comes from a title starting with ``doi:`` or,
        failing that, from the first match of the module-level ``p_doi``
        pattern in the extracted text.
    """
    fp = BytesIO(data)
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize
    doc.initialize()

    # Extract the metadata. ``info`` starts as None so that a document
    # without any 'Info' trailer entry yields an empty result instead of an
    # unbound local; the last trailer that has one wins.
    info = None
    for xref in doc.xrefs:
        info_ref = xref.trailer.get('Info')
        if info_ref:
            info = resolve1(info_ref)

    paper_info = {}
    if info:
        authors = info.get('Author')
        if authors:
            if ';' in authors:
                author_list = authors.split(';')
            elif ' AND ' in authors:
                author_list = authors.split(' AND ')
            elif ',' in authors:
                #FIXME: This cuts 'LastName, FirstName' in two...
                author_list = authors.split(',')
            else:
                author_list = [authors]
            paper_info['authors'] = author_list
        title = info.get('Title')
        if title:
            # Some PDFs have the doi as a title
            if title.lower().startswith('doi:'):
                paper_info['doi'] = title[4:]
            else:
                paper_info['title'] = title

    #TODO: Additional metadata?
    #TODO: What about embedded BibTeX (as done by JabRef)?

    # Extract text
    rsrcmgr = PDFResourceManager()
    content = cStringIO.StringIO()
    device = TextConverter(rsrcmgr,
                           content,
                           codec='utf-8',
                           laparams=LAParams())
    try:
        process_pdf(rsrcmgr,
                    device,
                    fp,
                    check_extractable=True,
                    caching=True)
        paper_info['extracted_text'] = content.getvalue()
    finally:
        # Release the converter and its buffer even if extraction fails.
        device.close()
        content.close()

    if 'doi' not in paper_info:
        # Try to find a DOI in the text
        doi = p_doi.search(paper_info['extracted_text'])
        if doi is not None:
            doi = doi.group(1)
            log_debug('Found a DOI: %s' % doi)
            paper_info['doi'] = doi

    log_debug('Exctracted paper_info from PDF: %s' % paper_info)
    return paper_info
def read_pdf(file, range_set, output_file):
    """Sort the text boxes of a PDF into columns by their x coordinate.

    Args:
        file: Path of the PDF to read.
        range_set: Key into the module-level ``xrange`` table; the selected
            entry supplies the ``[low, high)`` x ranges of the units, cases
            and total columns, in that order.
        output_file: Writable text stream that receives one
            ``At (x, y) is text: ...`` record per text box.

    Returns:
        The tuple ``(sku, fnsku, pieces, units, cases, total)`` of lists.

    Raises:
        IndexError: if an SKU text of six or more characters has no ``-``.
    """
    # Kept function-local, as in the original, so the block does not rely on
    # a module-level ``re`` import.
    import re

    sku = []
    fnsku = []
    pieces = []
    units = []
    cases = []
    total = []
    print(file)

    units_range, cases_range, total_range = xrange[range_set][:3]

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(file, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            print('Processing next page...')
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                # Left edge, top edge and text of the box.
                # (page size noted by the author: 595 W x 842 H)
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
                output_file.write('At %r is text: %s' % ((x, y), text))
                col = int(x)

                # sku: the digits of the second '-'-separated segment are
                # recorded as the piece count.
                if 35 <= col < 50 and len(text) >= 6:
                    sku.append(text)
                    seg = text.split('-')
                    pieces.append(re.sub("[^0-9]", "", seg[1]))

                # fnsku: the last 11 characters, when they start with X0/B0.
                if 85 <= col < 120:
                    tail = text[-11:]
                    if tail.startswith(('X0', 'B0')):
                        fnsku.append(tail)

                # units
                if units_range[0] <= col < units_range[1]:
                    units.append(text)
                # cases
                if cases_range[0] <= col < cases_range[1]:
                    cases.append(text)
                # total
                if total_range[0] <= col < total_range[1]:
                    total.append(text)
    return sku, fnsku, pieces, units, cases, total
def main(args):
    """Convert a PDF into a pretty-printed XML file.

    Args:
        args: Argument vector; ``args[1]`` is the input PDF path and
            ``args[2]`` the output XML path.

    Side effects:
        Rebinds the module globals ``fonts``, ``layout``, ``images_list``
        and ``filename``, prints progress to stdout and writes
        ``args[2]``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Declared before any assignment: a ``global`` statement that follows an
    # assignment to the same name is rejected by Python 3.
    global fonts, layout, images_list, filename

    input_file = args[1]
    output_file = args[2]
    print(args)
    filename = os.path.split(input_file)[1].split('.')[0]

    root = ET.Element('xml')
    intro = ET.SubElement(root, 'intro')
    title = ET.SubElement(root, 'title')
    subtitle = ET.SubElement(root, 'subtitle')
    body = ET.SubElement(root, 'body')
    section = ET.SubElement(root, 'section')
    body.text = intro.text = title.text = subtitle.text = section.text = ' '
    tree = ET.ElementTree(root)

    fonts = Counter([])
    images_list = []
    all_objs = []

    # Pages are parsed lazily, so the input stays open for the page loop.
    with open(input_file, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.detect_vertical = True
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for n, page in enumerate(PDFPage.create_pages(document)):
            print(n)
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            # Single-string form prints the same text on Python 2 and 3.
            print('%s %s %s %s' %
                  (n, layout.width, layout.height,
                   layout.width / (layout.height * 1.0)))
            # Keep only objects whose y0 lies outside the top and bottom 5%
            # of the page height.
            margin = layout.height * 0.05
            objs = [
                i for i in get_objects(layout)
                if margin <= i.y0 <= layout.height - margin
            ]
            all_objs.append(objs)
            fonts += get_fonts(objs)

    for n, objs in enumerate(all_objs):
        tree = make_xml(objs, n, tree)
    print('end')

    # NOTE(review): ``.replace('&', '&')`` is a no-op as written; it may
    # originally have un-escaped an entity - confirm against history.
    soup = BeautifulSoup(
        HTMLParser().unescape(
            ET.tostring(tree, encoding='unicode', method='xml')).replace(
                '&', '&').replace(' >', '>').replace('< ', '<'), 'xml')
    with open(output_file, 'wb') as f:
        # Write the encoded bytes directly; wrapping them in str() is a
        # no-op on Python 2 and produces a "b'...'" text on Python 3.
        f.write(soup.prettify().encode('utf-8'))
def extractpdf(ta_from, endstatus, **argv):
    ''' Best-effort extraction of the text content of a PDF file to csv.

        Each output row starts with the page and line number. Rows and
        columns are derived from the x and y coordinates of every text
        element, within the allowed tolerance; several text elements may be
        merged into one field (some PDFs store every character separately).
        The defaults for x_group and y_group seem ok for most files but may
        need experimenting.

        Text is written UTF-8 encoded because the csv module does not
        directly support reading and writing Unicode.

        If the PDF only contains image(s) no text is found and an
        InMessageError is raised.

        Optional keyword parameters: x_group, y_group, password, quotechar,
        field_sep, escape, charset.

        Mike Griffin 14/12/2011
    '''
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams, LTContainer, LTText, LTTextBox
    import csv
    from collections import defaultdict

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def receive_layout(self, ltpage):
            rows = defaultdict(dict)    # int(-y) -> {x: encoded text}
            row_anchors = []            # y values seen so far

            def collect(node):
                # walk the layout tree and file every text element by its
                # coordinates
                if isinstance(node, LTContainer):
                    for child in node:
                        collect(child)
                    return
                if not isinstance(node, LTText):
                    return
                (_unused1, _unused2, x_pos, y_pos) = node.bbox
                # snap y to an earlier row value that is within tolerance
                for anchor in row_anchors:
                    if anchor - y_group < y_pos < anchor + y_group:
                        y_pos = anchor
                row_anchors.append(y_pos)
                cells = rows[int(-y_pos)]
                cells[x_pos] = node.get_text().encode('utf-8')

            collect(ltpage)

            line_no = 0
            for row_key in sorted(rows):
                cells = rows[row_key]
                line_no += 1
                # first 2 columns are page and line numbers
                record = [ltpage.pageid, line_no]
                # start a new field whenever the horizontal gap exceeds the
                # tolerance
                last_x = 0
                pending = ''
                for x_pos in sorted(cells):
                    if last_x > 0 and x_pos - last_x > x_group:
                        record.append(pending)
                        pending = ''
                    pending += cells[x_pos]
                    last_x = x_pos
                record.append(pending)
                csvout.writerow(record)
            if line_no == 0:
                raise botslib.InMessageError(
                    _('PDF text extraction failed, it may contain just image(s)?'
                      ))

    # optional parameters
    x_group = argv.get('x_group', 10)  # text closer than this is one field
    y_group = argv.get('y_group', 5)  # lines closer than this are one line
    password = argv.get('password', '')
    quotechar = argv.get('quotechar', '"')
    field_sep = argv.get('field_sep', ',')
    escape = argv.get('escape', '\\')
    charset = argv.get('charset', 'utf-8')
    # quotes are doubled only when no escape character is configured
    doublequote = not escape

    try:
        pdf_stream = botslib.opendata_bin(ta_from.filename, 'rb')
        ta_to = ta_from.copyta(status=endstatus)
        tofilename = unicode(ta_to.idta)
        csv_stream = botslib.opendata_bin(tofilename, 'wb')
        csvout = csv.writer(csv_stream,
                            quotechar=quotechar,
                            delimiter=field_sep,
                            doublequote=doublequote,
                            escapechar=escape)

        # Process PDF
        rsrcmgr = PDFResourceManager(caching=True)
        device = CsvConverter(rsrcmgr, csv_stream, codec=charset)
        process_pdf(rsrcmgr,
                    device,
                    pdf_stream,
                    pagenos=set(),
                    password=password,
                    caching=True,
                    check_extractable=True)

        for closable in (device, pdf_stream, csv_stream):
            closable.close()
        filesize = os.path.getsize(botslib.abspathdata(tofilename))
        # update outmessage transaction with ta_info
        ta_to.update(statust=OK, filename=tofilename, filesize=filesize)
        botsglobal.logger.debug(_(' File written: "%(tofilename)s".'),
                                {'tofilename': tofilename})
    except:
        txt = botslib.txtexc()
        botsglobal.logger.error(
            _('PDF extraction failed, may not be a PDF file? Error:\n%(txt)s'),
            {'txt': txt})
        raise botslib.InMessageError(
            _('PDF extraction failed, may not be a PDF file? Error:\n%(txt)s'),
            {'txt': txt})
def run(filepath, outfile='pdfparser/minute_store/minutes.txt'):
    """Convert the PDF at ``filepath`` to plain text.

    Args:
        filepath: Path of the PDF to convert.
        outfile: Path of the text file to write (overwritten). Defaults to
            the location that was previously hard-coded.

    Side effects:
        Sets the ``debug`` class attribute of ``PDFDocument``, ``PDFParser``,
        ``CMapDB`` and ``PDFPageInterpreter`` to 0 and prints a confirmation
        line when done.
    """
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    codec = 'utf-8'
    rotation = 0
    caching = True

    debug = 0
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # Both files are closed on every exit path, including errors.
    with open(outfile, 'w') as outfp, open(filepath, 'rb') as fp:
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=LAParams(),
                               imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()
    print("Converted PDF to Text")
    return
def lambda_handler(event, context):
    """Convert a newly uploaded S3 PDF to XML and announce it on SNS.

    The object named in the first record of ``event`` is downloaded to
    ``/tmp/input.pdf``, converted to XML with pdfminer, written to
    ``/tmp/extract.xml``, uploaded to ``xml/<name>.xml`` in the same bucket,
    and the uploaded key is published to the ``stock_data_extracted`` topic.

    Args:
        event: S3 notification event; only ``event['Records'][0]`` is read.
        context: Lambda context object (unused).
    """
    # Grab file that was just uploaded to S3 bucket's "pdf" directory
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    s3_new_arrived_filename = urllib.unquote_plus(
        s3_record['object']['key'].encode('utf8'))
    print('Reading file ' + s3_new_arrived_filename + ' from S3')

    extracted_results_from_pdf = '/tmp/extract.xml'
    downloaded_pdf_file = '/tmp/input.pdf'

    # download file into /tmp
    s3.meta.client.download_file(bucket, s3_new_arrived_filename,
                                 downloaded_pdf_file)
    print('Downloaded file ' + s3_new_arrived_filename + ' from S3')

    # extract pdf into xml and upload xml to S3 bucket's "xml" directory
    resource_mgr = PDFResourceManager()
    retstr = BytesIO()
    device = XMLConverter(resource_mgr,
                          retstr,
                          codec='utf-8',
                          laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_mgr, device)
    try:
        # The local PDF is closed as soon as the last page has been read.
        with open(downloaded_pdf_file, 'rb') as infile_pdf_fp:
            for page in PDFPage.get_pages(infile_pdf_fp,
                                          set(),
                                          maxpages=0,
                                          password='',
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # xml data extracted from pdf; taken before the device is closed,
        # as in the original flow.
        data = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # write xml (extracted from pdf) to a new file
    print('Opening file ' + extracted_results_from_pdf +
          ' to write extracted xml from ' + s3_new_arrived_filename)
    with open(extracted_results_from_pdf, 'w') as outfile_xml_fp:
        print('Opened file ' + extracted_results_from_pdf)
        outfile_xml_fp.write(data)
        # The author reports that pdfminer omits the final </pages> tag
        # (https://github.com/euske/pdfminer/issues/229), so it is appended
        # manually.
        # NOTE(review): the buffer is read before device.close() above;
        # presumably that is why the tag is missing - confirm.
        outfile_xml_fp.write("</pages>")

    filename_without_folderprefix_and_ext = re.sub(
        r'.*/', '',
        os.path.splitext(s3_new_arrived_filename)[0])
    extracted_xml_filename_in_s3 = ('xml/' +
                                    filename_without_folderprefix_and_ext +
                                    '.xml')
    s3.meta.client.upload_file(extracted_results_from_pdf, bucket,
                               extracted_xml_filename_in_s3)

    # Publish the S3 location of the new XML to the "StockDataExtracted" SNS
    # topic; per the original author this triggers the next lambda function
    # (get_recommended_stocks).
    message = {"topten_trader_xml_filepath": extracted_xml_filename_in_s3}
    sns_client = boto3.client('sns', region_name='us-east-1')
    sns_client.publish(
        TargetArn='arn:aws:sns:us-east-1:<aws_account_#>:stock_data_extracted',
        Message=json.dumps({'default': json.dumps(message)}),
        Subject='Stock Buy Recommendations ' + str(datetime.date.today()),
        MessageStructure='json')
def getEntityPDFJson(self):
    """Extract named entities and relation triples from ``self.filename``.

    Every text box / text line of the PDF is (optionally) translated, run
    through ``nlp`` for entities and, line by line, through
    ``self.relationExtractor`` for OpenIE relations. New entities are
    appended to ``self.listEntities`` (each followed by a comma) and all
    relations are handed to ``self.insertRelation``.

    Returns:
        ``self.listEntities``.
    """
    print("PDF File")
    relationsList = []
    uniqueEntities = []
    skipped_labels = ('ORDINAL', 'CARDINAL', 'NORP', 'Non-‐binding')
    openie_properties = {
        "annotators": "tokenize,ssplit,pos,depparse,natlog,openie",
        "outputFormat": "json",
        "openie.triple.strict": "true",
        "openie.max_entailments_per_clause": "1"
    }

    # Pages are parsed lazily, so the file stays open for the page loop and
    # is closed on every exit path.
    with open(self.filename, 'rb') as fp:
        parser = PDFParser(fp)
        pdf_doc = PDFDocument()
        parser.set_document(pdf_doc)
        pdf_doc.set_parser(parser)
        pdf_doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in pdf_doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if not (isinstance(lt_obj, LTTextBox)
                        or isinstance(lt_obj, LTTextLine)):
                    continue
                # NOTE(review): this is a substring test against "eng", not
                # an equality test - confirm that is intended.
                if (self.lang not in "eng"):
                    extracted_text = Translator.translate_(
                        lt_obj.get_text(), self.credentials)
                else:
                    extracted_text = lt_obj.get_text()

                # Flatten whitespace before entity recognition.
                finaltext = (extracted_text.replace("\t", " ").replace(
                    "\r", "").replace("\n", "").replace("\u00a0", ""))
                # Named separately from the PDF document so the object being
                # paged through is never rebound.
                parsed = nlp(finaltext)

                for p in extracted_text.split("\n\n"):
                    for line in p.split("\n"):
                        output = self.relationExtractor.annotate(
                            line, properties=openie_properties)
                        if output is None:
                            continue
                        if not len(output["sentences"]) > 0:
                            continue
                        # NOTE(review): the first sentence's triples are
                        # repeated once per key of ``output``; kept as is,
                        # confirm whether per-sentence iteration was meant.
                        result = [
                            output["sentences"][0]["openie"]
                            for _ in output
                        ]
                        for triples in result:
                            for rel in triples:
                                relationSent = rel['subject'], rel[
                                    'relation'], rel['object']
                                print(relationSent)
                                relationsList.append(
                                    RelationTriple(rel['subject'],
                                                   rel['object'],
                                                   rel['relation'],
                                                   self.document_url))

                for X in parsed.ents:
                    if X.text != ('\n') and X.label_ not in skipped_labels:
                        if (self.isAlreadyThere(uniqueEntities,
                                                X.text) == False):
                            self.listEntities.append(X.text + ",")
                            uniqueEntities.append(X.text)

    print(self.listEntities)
    print("Relations Count:" + str(len(relationsList)))
    self.insertRelation(relationsList)
    return self.listEntities
def extract(pdf_path):
    """Scrape tender details from a PDF and append them to tenderauto.xlsx.

    The whole document is converted to text, each field is cut out with a
    pair of lazy regular expressions and printed, and finally a header row
    plus one data row are appended to the active sheet of
    ``tenderauto.xlsx``. The module-level names ``url`` and ``kurl`` supply
    two of the columns.

    Raises:
        AttributeError: if any of the expected markers is missing from the
            text (``re.search`` returns ``None``).
    """

    def grab(pattern, haystack):
        # First match of ``pattern`` in ``haystack`` as a string.
        return re.search(pattern, haystack).group()

    def drop(value, *tokens):
        # Remove every token from ``value``, in the given order.
        for token in tokens:
            value = value.replace(token, '')
        return value

    resource_manager = PDFResourceManager()
    buffer = io.StringIO()
    converter = TextConverter(resource_manager, buffer)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as handle:
        for pg in PDFPage.get_pages(handle,
                                    caching=True,
                                    check_extractable=True):
            page_interpreter.process_page(pg)
    text = buffer.getvalue()
    converter.close()
    buffer.close()
    print(text)

    # tender notice
    notice_span = grab(r'Reference.*?Tender', text)
    notice_no = drop(grab(r'Number.*?Tender', notice_span), 'Number', 'Tender')
    print('Tender notice no :-', notice_no)

    # tender type
    type_span = grab(r"Reference.{200}", text)
    tender_type = drop(grab(r"Type.*?Form", type_span), "Type", "Form", " ")
    print("Tender Type :-", tender_type)

    # tender category
    tender_category = drop(grab(r"Category.*?No", text), "Category", "No", " ")
    print("Tender Category :-", tender_category)

    # earnest money
    emd_span = grab(r"Amount.*?EMD", text)
    earnest = drop(grab(r"₹.*?E", emd_span), 'E', ' ')
    print("Earnest Money :- ", earnest)

    # published date
    published_span = grab(r'Published.*?Bid', text)
    publish_date = drop(grab(r'Date.*?Bid', published_span), "Date", "Bid")
    print("Published date :-", publish_date)

    # tender value
    value_span = grab(r'Value.*?Product', text)
    tender_value = drop(grab(r'₹.*?P', value_span), 'P')
    print("Tender value :-", tender_value)

    # tender fee
    fee_span = grab(r'Total.*?Payable', text)
    fee_inner = grab(r"Tender.*?P", fee_span)
    tender_fee = drop(grab(r"₹.*?P", fee_inner), 'P', "Fee")
    print('Tender fee :-', tender_fee)

    # authority name
    authority_span = grab(r'Authority.*?Address', text)
    authority_name = drop(grab(r'Name.*?Address', authority_span), "Name",
                          "Address")
    print("Authority name :-", authority_name)

    # authority address
    address_span = grab(r'Authority.*?Back', text)
    authority_address = drop(grab(r'Address.*?Back', address_span), 'Address',
                             'Back')
    print("Authority Address :-", authority_address)

    # location
    location_span = grab(r'Period.*?Pincode', text)
    location = drop(grab(r'Location.*?Pincode', location_span), 'Location',
                    'Pincode')
    print("Tender Location :-", location)

    # bid submission start date
    bid_start_span = grab(r'Submission.*?Bid', text)
    bid_start = drop(grab(r'Date.*?Bid', bid_start_span), "Date", "Bid")
    print("Bis submission start date :-", bid_start)

    # bid submission end date
    bid_end_span = grab(r'Submission.*?Tender', text)
    bid_end = drop(grab(r'End.*?Tender', bid_end_span), 'End', 'Tender',
                   'Date')
    print("Bid submission end date :-", bid_end)

    # bid opening date
    bid_open_span = grab(r'Published.{130}', text)
    bid_open_inner = grab(r'Bid.*?Document', bid_open_span)
    bid_open = drop(grab(r'Date.*?Document', bid_open_inner), 'Date',
                    'Document')
    print('Bid opening date :-', bid_open)

    # tender title (the inner pattern is greedy, unlike the others)
    title_span = grab(r'EMD.*?NDA', text)
    tender_title = drop(grab(r'Title.*Work', title_span), 'Title', 'Work')
    print("Tender Title :-", tender_title)

    # work description
    work_span = grab(r'EMD.*?NDA', text)
    work_description = drop(grab(r'Work.*?NDA', work_span), 'Work', 'NDA')
    print("Work description :-", work_description)

    # project state
    project_state = "Maharastra"
    print("Project state :-", project_state)

    # project country
    country = "India"
    print("Tender Country", country)

    # product category
    product_category = drop(grab(r'Product.*?Sub', text), 'Product',
                            'Category', 'Sub')
    print("Product Catagory :-", product_category)

    # document sale start date
    sale_start_span = grab(r'Sale.{100}', text)
    sale_start = drop(grab(r'Date.*?Document', sale_start_span), 'Date',
                      'Document')
    print("Document sale start date :-", sale_start)

    # document sale end date
    sale_end_span = grab(r'Sale.{100}', text)
    sale_end_inner = grab(r'End.*?Clarification', sale_end_span)
    sale_end = drop(grab(r'Date.*?Clarification', sale_end_inner), 'Date',
                    'Clarification')
    print('Document sale end date :-', sale_end)

    # product name: the last known product mentioned in the title wins
    known_products = [
        "Flowers", "High Security Registration Plates", "R.O.Plant",
        "SOLAR STREET LIGHT AND SOLAR PUMP", "White LED"
    ]
    mentioned = [name for name in known_products if name in tender_title]
    product = mentioned[-1] if mentioned else "N.A"

    workbook = op.load_workbook('tenderauto.xlsx')
    sheet = workbook.active
    sheet.append([
        'Tender Notice NO', 'Tender type', 'Product category',
        'Authority Name', 'Project State', 'EMD', 'Tender Value',
        'Tender Country', 'Contact Email ID', 'Authority website',
        'Tender Title', 'Tender Description', 'Bid Open Date', 'Phone no',
        'Fax no', 'Document sale end date', 'Product name',
        'Tender publish date', 'Tender document url'
    ])
    workbook.save(filename='tenderauto.xlsx')
    row = [
        notice_no, tender_type, product_category, authority_name,
        project_state, earnest, tender_value, country, "", url, tender_title,
        work_description, bid_open, "NA", "NA", sale_end, product,
        publish_date, kurl
    ]
    sheet.append(row)
    workbook.save(filename='tenderauto.xlsx')
def mine(self):
    """Lay out every page of ``self.fp`` and collect figures and tables.

    For each page the layout is appended to ``self.layouts``,
    ``self.page_num`` is incremented, per-type object counts are added to
    ``self.num`` and the page's elements (keyed by the module-level
    ``elem_type`` names) are appended to ``self.elem``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # PDFMiner set-up
    document = PDFDocument(PDFParser(self.fp))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        self.layouts.append(layout)
        self.page_num += 1

        counter = {kind: 0 for kind in object_type}
        page_width = layout.bbox[2]
        page_elem = {kind: [] for kind in elem_type}
        figure_groups = {}
        textBoxs = {'left': [], 'middle': [], 'right': [], 'all': []}
        line_groups = []

        for item in layout:
            if isinstance(item, LTTextBox):
                textBoxs[positionClassifier(item, page_width)].append(item)
                textBoxs['all'].append(item)
                counter['TextBox'] += 1
            elif isinstance(item, LTFigure):
                # figures less than 5 units high are skipped (and not
                # counted at all)
                if item.bbox[3] - item.bbox[1] < 5:
                    continue
                # find the figure title: the nearest acceptable text box
                # (figures and text boxes are few, so a linear scan is fine)
                best = 100000000
                title = None
                for group in posGroups_settings[positionClassifier(
                        item, page_width)]:
                    for box in textBoxs[group]:
                        rejected = (item.bbox[1] < box.bbox[3]
                                    or box.get_text()[0] == '('
                                    or len(box.get_text()) < MIN_LENTH)
                        if rejected:
                            continue
                        dist = rectDistance(item, box)
                        if dist < best:
                            best = dist
                            title = box
                if title is None:
                    title = notFound()
                # merge the neighbouring text boxes into the title
                title_merged = mergedText(title, textBoxs['all'])
                title_text = title_merged.text
                if title_text in figure_groups:
                    figure_groups[title_text].addFigure(figure(item))
                else:
                    new_group = figureGroup(figure(item))
                    new_group.setTitle(title_merged)
                    figure_groups[title_text] = new_group
                    page_elem['Figure'].append(new_group)
                counter['Figure'] += 1
            elif isinstance(item, LTLine):
                # horizontal lines wider than a sixth of the page
                is_rule = (equal(item.bbox[1], item.bbox[3])
                           and item.bbox[2] - item.bbox[0] > page_width / 6)
                if is_rule:
                    for lineGroup in line_groups:
                        if (equal(lineGroup.bbox[0], item.bbox[0])
                                and equal(lineGroup.bbox[2], item.bbox[2])):
                            lineGroup.addLine(item)
                            break
                    else:
                        # no group shares both x edges: start a new one
                        line_groups.append(Table(item))
                counter['Line'] += 1
            counter['Object'] += 1

        for kind in object_type:
            self.num[kind] += counter[kind]

        # find tables
        for table in line_groups:
            if table.lineNum < 2 or table.bbox[2] - table.bbox[0] < 50:
                continue
            text = self.text_in_rect(table, textBoxs['all'])
            if len(text) != 0 and re.search("Algorithm", text[0].get_text()):
                page_elem['Algorithm'].append(table)
                continue

            # distribute the text boxes over the gaps between lines
            line_count = len(table.lines)
            divided_text = [[] for _ in range(1, line_count)]
            for t in text:
                mid_y = midPoint(t)[1]
                for i in range(1, line_count):
                    if mid_y > table.lines[i][1]:
                        divided_text[i - 1].append(t)
                        break

            split_tables = []
            prev_i = 0
            for i in range(line_count - 1):
                row = divided_text[i]
                if len(row) == 0:
                    continue
                if (len(row) == 1 and row[0].bbox[2] - row[0].bbox[0] >
                        1 / 4 * (table.bbox[2] - table.bbox[0])):
                    # split the table
                    piece = Table()
                    piece.setLines(table.lines[prev_i:i + 1])
                    prev_i = i + 1
                    split_tables.append(piece)
            # the last table
            piece = Table()
            piece.setLines(table.lines[prev_i:len(table.lines)])
            split_tables.append(piece)

            for split_table in split_tables:
                best = 100000000
                title = None
                for group in posGroups_settings[positionClassifier(
                        split_table, page_width)]:
                    for box in textBoxs[group]:
                        # the title is above the table
                        if split_table.bbox[3] > box.bbox[1]:
                            continue
                        dist = rectDistance(table, box)
                        if dist < best:
                            best = dist
                            title = box
                if title is None:
                    title = notFound()
                split_table.setTitle(mergedText(title, textBoxs['all']))
                page_elem['Table'].append(split_table)

        for kind in elem_type:
            self.elem[kind].append(page_elem[kind])
""" """ if isinstance(layout_obj, LTTextBox): return [layout_obj] if isinstance(layout_obj, LTContainer): boxes = [] for child in layout_obj: boxes.extend(find_textboxes_recursively(child)) return boxes return [] laparams = LAParams(detect_vertical=True) resource_manager = PDFResourceManager() device = PDFPageAggregator(resource_manager, laparams=laparams) interpreter = PDFPageInterpreter(resource_manager, device) with open(sys.argv[1], 'rb') as f: for page in PDFPage.get_pages(f): interpreter.process_page(page) layout = device.get_result() boxes = find_textboxes_recursively(layout) boxes.sort(key=lambda b: (-b.y1, b.x0)) for box in boxes: print('-' * 10) print(box.get_text().strip())
def main(argv):
    """Command-line entry point: convert PDF files to text/html/xml/tag.

    Args:
        argv: Full argument vector (``argv[0]`` is the program name).

    Returns:
        ``100`` after printing the usage line (bad options, no input file or
        unknown output type), otherwise ``None``.

    The output type is taken from ``-t`` or, failing that, from the
    extension of the ``-o`` file, defaulting to ``text``. Output goes to
    ``sys.stdout`` when no ``-o`` file is given; ``sys.stdout`` is left open.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # page numbers are 1-based on the command line
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # disables layout analysis; a later -A/-V/-M/-L/-W/-F would then
            # fail on the None object, as before
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        # Never close the interpreter's stdout on behalf of the caller.
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                # each input is closed even if one of its pages fails
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
def __init__(self,
             file,
             merge_tags=('LTChar', 'LTAnno'),
             round_floats=True,
             round_digits=3,
             input_text_formatter=None,
             normalize_spaces=True,
             resort=True,
             parse_tree_cacher=None,
             laparams={
                 'all_texts': True,
                 'detect_vertical': True
             },
             password=''):
    """Open a PDF and prepare the pdfminer objects needed to lay out pages.

    Args:
        file: An object with a ``read`` attribute, or something ``open()``
            accepts as a path (opened here in binary mode).
        merge_tags: Stored on the instance as ``self.merge_tags``.
        round_floats: Stored on the instance as ``self.round_floats``.
        round_digits: Stored on the instance as ``self.round_digits``.
        input_text_formatter: Optional callable stored as
            ``self.input_text_formatter``; takes precedence over
            ``normalize_spaces``.
        normalize_spaces: When true and no formatter is supplied, the
            formatter collapses every whitespace run into a single space.
        resort: Stored on the instance as ``self.resort``.
        parse_tree_cacher: Optional cache object; ``set_hash_key(file)`` is
            called on it. A ``DummyCache`` is used when omitted.
        laparams: Either a dict of keyword arguments for ``LAParams`` or an
            object passed straight to ``PDFPageAggregator``. The default
            dict is only read, never mutated.
        password: Passed to the document constructor on the newer pdfminer
            API; not used on the pre-20131022 code path.

    Raises:
        TypeError: If ``file`` has no ``read`` attribute and ``open()``
            rejects it with a ``TypeError``.
    """
    # store input
    self.merge_tags = merge_tags
    self.round_floats = round_floats
    self.round_digits = round_digits
    self.resort = resort

    # set up input text formatting function, if any
    if input_text_formatter:
        self.input_text_formatter = input_text_formatter
    elif normalize_spaces:
        whitespace = re.compile(r'\s+')
        self.input_text_formatter = lambda s: whitespace.sub(' ', s)
    else:
        self.input_text_formatter = None

    # open doc
    opened_here = False
    if not hasattr(file, 'read'):
        try:
            file = open(file, 'rb')
        except TypeError:
            raise TypeError("File must be file object or filepath string.")
        opened_here = True

    try:
        parser = PDFParser(file)
        if hasattr(QPDFDocument, 'set_parser'):
            # pdfminer < 20131022
            doc = QPDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            # pdfminer >= 20131022
            doc = QPDFDocument(parser, password)
            parser.set_document(doc)
        if hasattr(doc, 'initialize'):
            # as of pdfminer==20140328, "PDFDocument.initialize() method is
            # removed and no longer needed."
            doc.initialize()
    except Exception:
        # Do not leak a handle we opened ourselves when parsing fails;
        # a caller-supplied file object stays the caller's responsibility.
        if opened_here:
            file.close()
        raise

    self.doc = doc
    self.parser = parser
    self.tree = None
    self.pq = None
    self.file = file

    if parse_tree_cacher:
        self._parse_tree_cacher = parse_tree_cacher
        self._parse_tree_cacher.set_hash_key(self.file)
    else:
        self._parse_tree_cacher = DummyCache()

    # set up layout parsing
    rsrcmgr = PDFResourceManager()
    # isinstance (rather than an exact type match) also accepts dict
    # subclasses such as OrderedDict.
    if isinstance(laparams, dict):
        laparams = LAParams(**laparams)
    self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

    # caches
    self._pages = []
    self._pages_iter = None
    self._elements = []
# Put the ./pdfminer directory that sits next to this file at the front of
# the import path so it is found before any other pdfminer on sys.path.
sys.path = [
    os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "./pdfminer"))
] + sys.path
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar

# Open the PDF file; the ``with`` block closes it once every page has been
# processed (the handle was previously never closed).
with open("hb.pdf", "rb") as fp:
    parser = PDFParser(fp)  # create a PDF parser from the file object
    doc = PDFDocument(parser)  # create the PDF document
    rs = PDFResourceManager()  # resource manager that holds shared resources
    # Create the PDF device object.
    lapara = LAParams()
    device = PDFPageAggregator(rs, laparams=lapara)
    inte = PDFPageInterpreter(rs, device)
    # Process the content of every page in the document, one page per
    # iteration. ``layout`` is the result the aggregator produced for that
    # page; per the original note it is an LTPage holding the objects parsed
    # from the page (typically LTTextBox, LTFigure, LTImage,
    # LTTextBoxHorizontal, ...).
    # NOTE(review): the original note says to read a ``txt`` attribute to get
    # the text; pdfminer text objects expose get_text() - confirm.
    for page in PDFPage.create_pages(doc):
        inte.process_page(page)
        layout = device.get_result()