def getPdfLayout(filename):
    """Print the text of each text-bearing element on a PDF's first page.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        None. Text is written to standard output; nothing is printed when
        the document yields no pages.
    """
    # Open the file; the context manager closes it even if parsing fails.
    with open(filename, 'rb') as fp:
        # Create the parser.
        parser = PDFParser(fp)
        # Create the document object.
        # NOTE(review): this is the legacy no-argument PDFDocument() /
        # set_parser() / get_pages() API -- confirm it matches the
        # installed pdfminer version.
        document = PDFDocument()
        # Attach the document to the parser (and vice versa).
        parser.set_document(document)
        document.set_parser(parser)

        # Create the PDF resource manager.
        manager = PDFResourceManager()
        # Create the layout-analysis parameter object.
        laparams = LAParams()
        # Create the aggregator. get_result() below is provided by
        # PDFPageAggregator; a TextConverter writes to a stream instead and
        # does not accept a ``code`` keyword.
        device = PDFPageAggregator(manager, laparams=laparams)
        # Create the interpreter.
        interpreter = PDFPageInterpreter(manager, device)

        # Get the first PDF page without materializing every page.
        first_page = next(iter(document.get_pages()), None)
        if first_page is None:
            return

        interpreter.process_page(first_page)
        layout = device.get_result()

        for element in layout:
            # Only text-bearing elements expose get_text(); skip the rest
            # (presumably lines, rectangles, figures) instead of crashing.
            get_text = getattr(element, 'get_text', None)
            if get_text is not None:
                print(get_text())
def convert_pdf_to_txt(path, pgno):
    """Return the text of the first horizontal text box on one PDF page.

    Args:
        path: Path of the PDF file to read.
        pgno: Zero-based index of the page to inspect.

    Returns:
        The text of the first ``LTTextBoxHorizontal`` found on page *pgno*,
        or ``None`` when that page does not exist or holds no such box.
    """
    # The context manager guarantees the file is closed, including on the
    # early return below.
    with open(path, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # The interpreter must be bound to the same aggregator that is later
        # asked for its result; otherwise the aggregator never sees a page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_number, page in enumerate(PDFPage.get_pages(fp)):
            if page_number != pgno:
                continue

            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    return element.get_text()

            # The requested page has been handled; later pages cannot match.
            return None

    return None
def text_extraction_example(pdf_filepath='/path/to/sample.pdf',
                            use_text_converter=True):
    """Print text extracted from a PDF file (pdfminer usage example).

    Args:
        pdf_filepath: Path of the PDF file to read.
        use_text_converter: When true (the default), print the plain text of
            every page via ``TextConverter``. When false, run layout analysis
            on a single page with ``PDFPageAggregator`` and print each
            (bounding box, text) pair returned by ``extract_text_object()``.

    Returns:
        None. Results are printed to standard output. Errors are reported
        with a printed message rather than propagated to the caller.
    """
    import io

    try:
        # Open a PDF file; the context manager closes it on every exit path.
        with open(pdf_filepath, 'rb') as fp:
            # Create resource manager.
            rsrcmgr = PDFResourceManager()

            # Set parameters for layout analysis.
            laparams = LAParams(
                # If two characters have more overlap than this they are
                # considered to be on the same line.
                line_overlap=0.5,
                # If two characters are closer together than this margin
                # they are considered part of the same line.
                char_margin=2.0,
                # If two characters on the same line are further apart than
                # this margin then they are considered to be two separate
                # words, and an intermediate space will be added for
                # readability.
                word_margin=0.1,
                # If two lines are close together they are considered to be
                # part of the same paragraph.
                line_margin=0.5,
                # Specifies how much a horizontal and vertical position of a
                # text matters when determining the order of text boxes.
                boxes_flow=0.5,
                # If vertical text should be considered during layout
                # analysis.
                detect_vertical=False,
                # If layout analysis should be performed on text in figures.
                all_texts=False,
            )

            if use_text_converter:
                retstr = io.StringIO()
                device = TextConverter(
                    rsrcmgr, retstr, pageno=1, laparams=laparams,
                    showpageno=False, imagewriter=None,
                )
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # pagenos uses zero-based indices. pagenos is sorted inside
                # the function.
                for page in PDFPage.get_pages(
                        fp, pagenos=None, maxpages=0, password=b''):
                    interpreter.process_page(page)

                    texts = retstr.getvalue()  # All texts in a page.
                    # Empty the shared buffer so the next page's output does
                    # not repeat the pages already printed.
                    retstr.seek(0)
                    retstr.truncate(0)

                    print('------------------------------')
                    print(texts)
            else:
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                page_no = 1
                # pagenos uses zero-based indices. pagenos is sorted inside
                # the function.
                pages = PDFPage.get_pages(
                    fp, pagenos=[page_no], maxpages=0, password=b'')
                page = next(pages, None)
                if page is None:
                    # Report a missing page explicitly instead of letting a
                    # message-less StopIteration reach the generic handler.
                    print('Page {} not found in {}.'.format(
                        page_no, pdf_filepath))
                    return

                interpreter.process_page(page)
                layout = device.get_result()
                print('Page bounding box = {}.'.format(layout.bbox))

                bbox_text_pairs = extract_text_object(
                    layout, pdf_filepath, page_no)
                for idx, (bbox, txt) in enumerate(bbox_text_pairs):
                    print(
                        '------------------------------ Block {} {} in page {} in {}.'
                        .format(idx, bbox, page_no, pdf_filepath))
                    print(txt)
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        # Example entry point: report the failure instead of propagating it.
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
def _iter_rotated_pages(infp, page_numbers, maxpages, password, caching,
                        rotation):
    """Yield the selected pages of *infp* with *rotation* degrees added."""
    pages = PDFPage.get_pages(
        infp, page_numbers, maxpages=maxpages, password=password,
        caching=caching, check_extractable=True)
    for page in pages:
        page.rotate = (page.rotate + rotation) % 360
        yield page


def extract_text(
        files=None, outfile='-',
        _py2_no_more_posargs=None,  # Python 2 shim: traps extra positionals
        no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
        word_margin=None, char_margin=None, line_margin=None,  # LAParams
        boxes_flow=None,  # LAParams
        output_type='text', codec='utf-8', strip_control=False,
        maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
        layoutmode='normal', output_dir=None, debug=False,
        disable_caching=False, **other):
    """Extract text from PDF files, then print their horizontal text boxes.

    Each file is processed twice: first through a ``TextConverter`` that
    writes to the output stream, then through a ``PDFPageAggregator`` whose
    ``LTTextBoxHorizontal`` elements are printed to standard output. Both
    passes use the same page selection, password, caching and rotation.

    Args:
        files: Paths of the PDF files to process; must be non-empty.
        outfile: Output path, or ``'-'`` for standard output.
        _py2_no_more_posargs: Must stay ``None``; any other value means too
            many positional arguments were supplied.
        no_laparams: When true, no ``LAParams`` object is created.
        all_texts, detect_vertical, word_margin, char_margin, line_margin,
            boxes_flow: ``LAParams`` attributes to override when not
            ``None``.
        output_type: Updated from the suffix of *outfile*, but not otherwise
            read by this function.
        codec: Codec handed to the text converter.
        maxpages, page_numbers, password: Page selection and password
            forwarded to ``PDFPage.get_pages``.
        rotation: Degrees added to each page's rotation (modulo 360).
        output_dir: When set, an ``ImageWriter`` for this directory is
            created and passed to the text converter.
        disable_caching: When true, caching is turned off for the resource
            manager and for page loading.
        strip_control, scale, layoutmode, debug, **other: Accepted for
            interface compatibility; not used by this function.

    Returns:
        The output stream: the binary file opened for *outfile*, or
        ``sys.stdout.buffer`` when *outfile* is ``'-'``. The caller owns it.

    Raises:
        ValueError: If extra positional arguments were passed or *files* is
            empty.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # Unless disabled, create an LAParams object and apply every override
    # that was explicitly given.
    if no_laparams:
        laparams = None
    else:
        laparams = pdfminer.layout.LAParams()
        overrides = {
            "all_texts": all_texts,
            "detect_vertical": detect_vertical,
            "word_margin": word_margin,
            "char_margin": char_margin,
            "line_margin": line_margin,
            "boxes_flow": boxes_flow,
        }
        for name, value in overrides.items():
            if value is not None:
                setattr(laparams, name, value)

    imagewriter = ImageWriter(output_dir) if output_dir else None

    if output_type == "text" and outfile != "-":
        suffix_types = ((".htm", "html"), (".html", "html"),
                        (".xml", "xml"), (".tag", "tag"))
        for override, alttype in suffix_types:
            if outfile.endswith(override):
                output_type = alttype

    opened_here = outfile != "-"
    if opened_here:
        outfp = open(outfile, "wb")
    else:
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'

    caching = not disable_caching
    try:
        for fname in files:
            with open(fname, "rb") as infp:
                rsrcmgr = PDFResourceManager_new(caching=caching)

                # First pass: write the extracted text to the output stream.
                device = TextConverter(
                    rsrcmgr, outfp, codec=codec, laparams=laparams,
                    imagewriter=imagewriter)
                # Kept in the original order: the converter just created
                # holds the text stream, while later files and the return
                # value use the binary buffer.
                if outfp == sys.stdout:
                    outfp = sys.stdout.buffer
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in _iter_rotated_pages(
                        infp, page_numbers, maxpages, password, caching,
                        rotation):
                    interpreter.process_page(page)

                # Second pass: layout analysis over the same pages.
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in _iter_rotated_pages(
                        infp, page_numbers, maxpages, password, caching,
                        rotation):
                    interpreter.process_page(page)
                    # Receive the LTPage object for the page.
                    layout = device.get_result()
                    for element in layout:
                        if isinstance(element, LTTextBoxHorizontal):
                            print(element.get_text())
    except BaseException:
        # On failure the caller never receives the handle, so close the
        # output file opened here before re-raising.
        if opened_here:
            outfp.close()
        raise

    return outfp