def convert_pdf_to_txt(path_to_file):
    """Extract the text and the outline titles of a PDF file.

    Args:
        path_to_file: Path of the PDF to read.

    Returns:
        A ``(text, ToC_list)`` tuple holding the extracted text and the list
        of outline titles.  Any exception is logged rather than raised; in
        that case the values collected so far are returned, starting from
        ``("", [])``.
    """
    # Defaults guarantee the final ``return`` never sees unbound locals when
    # the try body fails early (missing file, PDF without outlines, ...).
    text = ""
    ToC_list = []
    retstr = None
    device = None
    try:
        with open(path_to_file, 'rb') as fp:
            rsrcmgr = PDFResourceManager()
            retstr = io.StringIO()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Index 1 of every outline entry is its title.
            ToC_list = [entry[1] for entry in doc.get_outlines()]
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed in ``finally``.
            text = retstr.getvalue()
        print(text)
    except Exception:
        logging.error("Exception occurred", exc_info=True)
    finally:
        # Release converter and buffer on both success and failure paths.
        if device is not None:
            device.close()
        if retstr is not None:
            retstr.close()
    return text, ToC_list
def get_outphs(path):
    """Return the outline of a PDF together with 1-based page numbers.

    Args:
        path: Path of the PDF file.

    Returns:
        A list of dicts with the keys ``"level"``, ``"raw_title"`` and
        ``"pageno"``, one per outline entry.  ``"pageno"`` is ``None`` when
        no target page could be determined for the entry.

    Exceptions raised by pdfminer while reading the document are not caught.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.psparser import PSLiteral
    from pdfminer.pdftypes import resolve1

    with open(path, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Map page object id -> zero-based page index.
        pages = {
            page.pageid: pageno
            for (pageno, page) in enumerate(PDFPage.create_pages(document))
        }

        def resolve_dest(dest):
            # Named destinations are looked up in the document first.
            if isinstance(dest, str):
                dest = resolve1(document.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(document.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            return dest

        toc = []
        for (level, title, dest, a, structelem) in document.get_outlines():
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                action = a
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = resolve_dest(action['D'])
                        pageno = pages[dest.resolve()[0].objid]
            toc.append({
                "level": level,
                "raw_title": title,
                # Entries without a resolvable target keep ``None`` instead
                # of failing on ``None + 1``.
                "pageno": None if pageno is None else pageno + 1,
            })
    return toc
def table_of_contents_example():
    """Print level and title of each outline entry of a sample PDF.

    The PDF path is hard-coded.  A missing file, a document without
    outlines and any other error are reported on stdout instead of being
    raised.  Returns ``None``.
    """
    pdf_filepath = '/path/to/sample.pdf'
    try:
        # The context manager closes the file on every exit path.
        with open(pdf_filepath, 'rb') as fp:
            # Parser bound to the file, document built on top of the parser.
            document = PDFDocument(PDFParser(fp), password=b'')

            # Abort when the document forbids text extraction.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            try:
                for entry in document.get_outlines():
                    level, title, dest, action, se = entry
                    print(level, title)
            except PDFNoOutlines as ex:
                print('No outline in {}: {}.'.format(pdf_filepath, ex))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
def purge_index(data, file):
    """Remove every outline title of a PDF from a block of text.

    Args:
        data: Text to clean; its newlines are replaced by spaces first.
        file: Path of the PDF whose outline titles are removed from ``data``.

    Returns:
        The single-line text with all case-insensitive occurrences of the
        stripped outline titles deleted.
    """
    with open(file, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # Read the titles while the file is still open.
        titles = [
            title.strip()
            for (level, title, dest, a, se) in document.get_outlines()
        ]

    bc_text = ' '.join(data.split('\n'))
    for title in titles:
        if not title:
            # An empty pattern would match everywhere and remove nothing.
            continue
        # Titles are literal text: escape them so characters such as
        # '.', '(', '+' or '?' are not interpreted as regex syntax.
        bc_text = re.sub(re.escape(title), '', bc_text, flags=re.IGNORECASE)
    return bc_text
def convert_pdf_to_txt(path_to_file):
    """Extract the text and the cleaned outline titles of a PDF file.

    Args:
        path_to_file: Path of the PDF to read.

    Returns:
        A ``(text, Filtered_ToC_list)`` tuple.  ``text`` is the ``"content"``
        entry of ``parser.fromfile(path_to_file)``; the list holds the outline
        titles with a leading run of digits removed and whitespace stripped.
        Any exception is logged rather than raised; values not computed by
        then keep their defaults ``""`` and ``[]``.
    """
    # Defaults so that a failure inside the try block cannot leave the
    # returned names unbound.
    text = ""
    Filtered_ToC_list = []
    try:
        # ``parser`` is the module-level object providing ``fromfile``
        # (presumably tika's parser module -- TODO confirm).  The pdfminer
        # parser below is created inline so it no longer shadows that name.
        parsed_txt = parser.fromfile(path_to_file)
        text = parsed_txt["content"]

        with open(path_to_file, 'rb') as fp:
            doc = PDFDocument(PDFParser(fp))
            # Index 1 of every outline entry is its title.
            ToC_list = [i[1] for i in doc.get_outlines()]

        Filtered_ToC_list = [
            re.sub("^[0-9]+", "", i).strip()
            for i in ToC_list
        ]
    except Exception:
        logging.error("Exception occurred", exc_info=True)
    return text, Filtered_ToC_list
def parse_paragraphs(self, text):
    """Collect headlines and hand them to ``self.count_paragraphs``.

    For PDFs (``self.is_pdf`` is true) the headlines are the level-1 outline
    titles of ``self.paper_filename``; otherwise they are the lines of
    ``text`` that start with the markdown marker ``'## '``.
    ``count_paragraphs(text, lines, headlines)`` is called only when at least
    one headline was found.
    """
    lines = text.split('\n')

    if not self.is_pdf:
        # Markdown input: every '## ' line is a headline.
        headlines = [line for line in lines if line.startswith('## ')]
    else:
        headlines = []
        with open(self.paper_filename, 'rb') as pdf:
            document = PDFDocument(PDFParser(pdf))
            try:
                for (level, title, _, _, _) in document.get_outlines():
                    if level == 1:
                        headlines.append(title)
            except PDFNoOutlines:
                logging.info(
                    "No outline found -> skipping paragraph search..."
                )

    if headlines:
        self.count_paragraphs(text, lines, headlines)
def main():
    """Process every page of a hard-coded PDF and print its outline.

    Written for Python 2 (uses ``print`` statements).  Raises
    ``PDFTextExtractionNotAllowed`` when the document is not extractable.
    Returns 0.
    """
    # Open a PDF file.
    with open('/home/chris/Documents/Literature/DFT Primer.pdf', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # No password is supplied here.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print rsrcmgr
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document and print whatever
        # process_page() returns.
        for page in PDFPage.create_pages(document):
            print interpreter.process_page(page)
        # Print level and title of every outline entry.
        outlines = document.get_outlines()
        for (level,title,dest,a,se) in outlines:
            print (level, title)
    return 0
def dumpoutline(
    outfp,
    fname,
    objids,
    pagenos,
    password="",
    dumpall=False,
    codec=None,
    extractdir=None,
):
    """Write the outline of a PDF to ``outfp`` as XML.

    Args:
        outfp: Writable text stream receiving the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec, extractdir: Accepted but not used
            by this function.
        password: Password passed to ``PDFDocument``.

    Nothing is written when the document has no outlines.  Returns ``None``.
    """
    # ``with`` / ``finally`` release file and parser even if dumping fails.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map page object id -> 1-based page number.
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest):
                # Named destinations are looked up in the document first.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest["D"]
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write("<outlines>\n")
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                                dest = resolve_dest(action["D"])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode("utf-8", "xmlcharrefreplace")
                    if not isinstance(s, str):
                        # On Python 3 the encoded title is ``bytes`` and
                        # would be formatted as "b'...'"; decode it back.
                        s = s.decode("utf-8")
                    outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                    if dest is not None:
                        outfp.write("<dest>")
                        dumpxml(outfp, dest)
                        outfp.write("</dest>\n")
                    if pageno is not None:
                        outfp.write("<pageno>%r</pageno>\n" % pageno)
                    outfp.write("</outline>\n")
                outfp.write("</outlines>\n")
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def dumpoutline(outfp: TextIO, fname: str, objids: Any,
                pagenos: Container[int], password: str = '',
                dumpall: bool = False, codec: Optional[str] = None,
                extractdir: Optional[str] = None) -> None:
    """Write the outline of a PDF to ``outfp`` as XML.

    Args:
        outfp: Text stream receiving the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec, extractdir: Accepted but not used
            by this function.
        password: Password passed to ``PDFDocument``.

    Nothing is written when the document has no outlines.
    """
    # ``with`` / ``finally`` release file and parser even if dumping fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map page object id -> 1-based page number.
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest: object) -> Any:
                # Named destinations are looked up in the document first.
                if isinstance(dest, (str, bytes)):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                    'D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = escape(title)
                    outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def extract_outline(pdf_path):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    Args:
        pdf_path: Path of the PDF file.

    Exceptions raised while reading the outline are not caught.
    """
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # The list is built inside the ``with`` block so the outline is
        # fully read before the file is closed.
        return [
            (level, title)
            for (level, title, dest, a, se) in document.get_outlines()
        ]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline of a PDF to ``outfp`` as XML.

    Args:
        outfp: Writable stream receiving the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec, extractdir: Accepted but not used
            by this function.
        password: Password passed to ``PDFDocument``.

    Nothing is written when the document has no outlines.  Returns ``None``.
    """
    # ``with`` / ``finally`` release file and parser even if dumping fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map page object id -> 1-based page number.
            pages = dict((page.pageid, pageno) for (pageno, page)
                         in enumerate(PDFPage.create_pages(doc), 1))

            def resolve_dest(dest):
                # Named destinations are looked up in the document first.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                    'D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    if not isinstance(s, str):
                        # On Python 3 the encoded title is ``bytes`` and
                        # "%s" would render it as "b'...'"; decode it back.
                        s = s.decode('utf-8')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def read_outline(f):
    """Print level and title of every outline entry of an open PDF.

    Args:
        f: Binary file object of the PDF.

    When the document catalog has no ``'Outlines'`` entry, a notice is
    printed instead.
    """
    document = PDFDocument(PDFParser(f), None)

    # Check for the presence of an outline before asking for it.
    if document.catalog.get('Outlines') is None:
        print(f'PDF文書にアウトラインはありません')
        return

    for entry in document.get_outlines():
        level, title, dest, a, se = entry
        print(f'階層: {level}, タイトル: {title}')
def get_toc(pdf_path):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    Args:
        pdf_path: Path of the PDF file.

    Exceptions raised while reading the outline are not caught.
    """
    with open(pdf_path, "rb") as infile:
        document = PDFDocument(PDFParser(infile))
        # Built inside the ``with`` block so the outline is fully read
        # before the file is closed.
        return [
            (level, title)
            for (level, title, dest, a, structelem) in document.get_outlines()
        ]
def get_toc(pdf_path):
    """Return the outline of a PDF as one flat list.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        ``[level, title, level, title, ...]``: level and title of each
        outline entry appended alternately.

    Exceptions raised while reading the outline are not caught.
    """
    toc = []
    with open(pdf_path, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))
        for (level, title, dest, a, structelem) in document.get_outlines():
            toc.extend((level, title))
    return toc
def parse(filename, maxlevel):
    """Print the outline of a PDF as HTML headings.

    Args:
        filename: Path of the PDF file.
        maxlevel: Deepest outline level to print; deeper entries are skipped.

    Each printed line has the form ``<hN>title</hN>`` where ``N`` is the
    outline level.  Newlines are removed from the title and runs of
    whitespace collapse to a single space.
    """
    with open(filename, "rb") as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level > maxlevel:
                continue
            if not isinstance(title, str):
                # Python 2 ``unicode``: encode to a native str as before.
                # On Python 3 the title already is ``str``; encoding it made
                # ``replace("\n", "")`` fail with a bytes/str TypeError.
                title = title.encode("utf8")
            title = " ".join(title.replace("\n", "").split())
            print("<h{level}>{title}</h{level}>".format(level=level, title=title))
def read_outlines(fp):
    """Print level and title of every outline entry of an open PDF.

    Args:
        fp: Binary file object of the PDF.

    If reading the outline fails, a notice that the document has no outline
    is printed instead of raising.
    """
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Fetch the document outline (table of contents).
    try:
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            print(level, title)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print("文档不存在大纲!")
def get_toc(self, pdf_path):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The list of ``(level, title)`` tuples, or ``False`` when reading the
        outline raises.  Errors while opening or parsing the file itself
        propagate to the caller.
    """
    with open(pdf_path, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))
        try:
            return [
                (level, title)
                for (level, title, dest, a, structelem)
                in document.get_outlines()
            ]
        except Exception:
            # Callers receive ``False`` as the failure marker.
            return False
def get_pdf_file_structure(path_to_pdf):
    """Return the titles of all outline entries of a PDF.

    Args:
        path_to_pdf: Path of the PDF file.

    Returns:
        A list with the title of every outline entry, in iteration order.

    Exceptions raised while reading the outline are not caught; the file is
    closed in that case as well.
    """
    with open(path_to_pdf, 'rb') as fp:
        document = PDFDocument(PDFParser(fp), password="")
        chapters = [
            title
            for (level, title, dest, a, se) in document.get_outlines()
        ]
    return chapters
def get_toc(pdf_path):
    """Print the stripped, ASCII-filtered title of every outline entry.

    Written for Python 2 (uses a ``print`` statement).  ``remove_non_ascii``
    is defined outside this block.

    Returns ``'-'`` once at least one outline entry was printed, otherwise
    the initial empty list (also when the PDF has no outlines).
    """
    # NOTE(review): the file is never closed in this function.
    infile = open(pdf_path, 'rb')
    parser = PDFParser(infile)
    document = PDFDocument(parser)
    toc = list()
    try:
        for (level, title, dest, a, structelem) in document.get_outlines():
            print remove_non_ascii(title.strip())
            # The return value only signals that something was printed.
            toc = '-'
    except PDFNoOutlines:
        pass
    return toc
def extract_outlines(path):
    """Print level and title of every outline entry of a PDF.

    Args:
        path: Path of the PDF file.

    Exceptions raised while reading the outline are not caught.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument

    # Open the PDF; the context manager closes it on every exit path.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Walk the outlines of the document.
        for (level, title, dest, a, se) in document.get_outlines():
            print(level, title)
def parse(path):
    """Print level and JSON-encoded title of every outline entry of a PDF.

    Args:
        path: Path of the PDF file.

    Titles are serialised with ``json.dumps(..., ensure_ascii=False)``.
    Exceptions raised while reading the outline are not caught.
    """
    # Open in binary read mode; closed automatically on every exit path.
    with open(path, 'rb') as fp:
        # Create a PDF parser from the file object.
        praser = PDFParser(fp)
        # Create the PDF document.
        doc = PDFDocument(praser)
        # Connect the parser with the document object.
        praser.set_document(doc)
        for (level, title, dest, a, se) in doc.get_outlines():
            print(level, json.dumps(title, ensure_ascii=False))
def via_toc(path):
    """Derive a title for a PDF from its first outline entry.

    Written for Python 2 (``except Exception, e`` syntax).

    Returns a dict ``{"/Title": <title of first outline entry>, "/Author": ""}``
    or ``None`` when anything in the body raises (missing file, no outlines,
    empty outline list, ...).
    """
    try:
        # ``titles`` and ``toc`` are assigned but not used below.
        titles = []
        # NOTE(review): the file is never closed in this function.
        infile = open(path, 'rb')
        parser = PDFParser(infile)
        document = PDFDocument(parser)
        toc = list()
        # Materialise all (level, title) pairs and take the first title.
        title = [(level, title) for (level, title, dest, a, structelem) in document.get_outlines()][0][1]
        return {"/Title": title, "/Author": ""}
    except Exception, e:
        return None
def extract_from_awspdf(url):
    """Return the level-2 outline titles found below the "Actions" section.

    Args:
        url: Passed to ``get_or_dl_pdf`` to obtain the PDF file object.

    Returns:
        Titles of all level-2 outline entries whose most recent level-1
        entry is titled ``"Actions"``.
    """
    pdf_file = get_or_dl_pdf(url)
    print("Analyzing pdf... ", end='', flush=True)
    document = PDFDocument(PDFParser(pdf_file))

    actions = []
    current_section = ""
    for level, title, *_ in document.get_outlines():
        if level == 1:
            # A new top-level section starts here.
            current_section = title
        elif level == 2 and current_section == "Actions":
            actions.append(title)
    return actions
def extract_TOC(pdf_path):
    """Return the outline of a PDF as an XML string.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A string with an ``<outlines>`` element holding one ``<outline>``
        per entry (optionally with ``<dest>`` and ``<pageno>``), or ``""``
        when the document has no outlines.
    """
    toc = ""
    # ``with`` / ``finally`` release file and parser even if building fails.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            document = PDFDocument(parser, b"")
            # Map page object id -> 1-based page number.
            pages = {page.pageid: pageno for (pageno, page)
                     in enumerate(PDFPage.create_pages(document), 1)}

            def resolve_dest(dest):
                # Named destinations are looked up in the document first.
                if isinstance(dest, str):
                    dest = resolve1(document.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(document.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = document.get_outlines()
                toc += '<outlines>\n'
                for (level, title, dest, a, se) in tqdm(outlines, leave=False):
                    pageno = None
                    if dest:
                        # ``dest`` is deliberately overwritten with its
                        # resolved form; it is dumped further below.
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                                dest = resolve_dest(action.get("D"))
                                pageno = pages[dest[0].objid]
                    string = escape_str(title).encode("utf-8", "xmlcharrefreplace")
                    if not isinstance(string, str):
                        # On Python 3 the encoded title is ``bytes`` and
                        # would be formatted as "b'...'"; decode it back.
                        string = string.decode("utf-8")
                    toc += '<outline level="{!r}" title="{}">\n'.format(level, string)
                    if dest is not None:
                        toc += "<dest>"
                        toc = dumpxml(toc, dest)
                        toc += "</dest>\n"
                    if pageno is not None:
                        toc += "<pageno>{}</pageno>\n".format(pageno)
                    toc += "</outline>\n"
                toc += "</outlines>\n"
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return toc
def parse(filename, maxlevel):
    """Print the outline of a PDF as HTML headings.

    Args:
        filename: Path of the PDF file.
        maxlevel: Deepest outline level to print; deeper entries are skipped.

    Each printed line has the form ``<hN>title</hN>`` where ``N`` is the
    outline level.  Newlines are removed from the title and runs of
    whitespace collapse to a single space.
    """
    with open(filename, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level > maxlevel:
                continue
            if not isinstance(title, str):
                # Python 2 ``unicode``: encode to a native str as before.
                # On Python 3 the title already is ``str``; encoding it made
                # ``replace('\n', '')`` fail with a bytes/str TypeError.
                title = title.encode('utf8')
            title = ' '.join(title.replace('\n', '').split())
            print('<h{level}>{title}</h{level}>'
                  .format(level=level, title=title))
def process_text():
    """Build the chapter, section and paragraph offset maps for the book.

    Reads the outline of 'Algorithms.pdf' and the text of 'Algorithms.txt'
    and fills the module-level objects ``chapters``, ``sections``,
    ``ch_map``, ``sec_map``, ``para_map``, ``count``, ``book`` and
    ``book_trie``.  Returns ``None``.
    """
    global count,ch_map,sec_map,para_map,book_trie,book
    # NOTE(review): this file handle is never closed.
    fp = open('Algorithms.pdf', 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser, "secret")
    outlines = document.get_outlines()
    #get index contexts
    i=j=0
    for (level,title,dest,a,se) in outlines:
        if level==1:
            # Keep at most the first 9 level-1 titles as chapters (keys 1..9).
            if i<9:
                i+=1
                chapters[i]=unicodedata.normalize('NFKD', title).encode('ascii','ignore')
        else:
            # Keep at most the first 44 other titles as sections (keys 1..44).
            if j< 44:
                j+=1
                sections[j]=unicodedata.normalize('NFKD', title).encode('ascii','ignore')
    with open('Algorithms.txt','rb') as f:
        book=f.read()
    a=0
    # Offset of the first occurrence of each chapter title in the book text;
    # index() raises ValueError when a title is not found.
    for j in chapters:
        ch_map[j]=book.index(chapters[j])
        a=ch_map[j]
    a=0
    # Same for section titles.
    for j in sections:
        sec_map[j]=book.index(sections[j])
        a=sec_map[j]
    i=1
    a=0
    # Paragraph 1 starts at offset 0; each later paragraph starts right
    # after a blank line ('\n\n').
    para_map[i]=a
    i+=1
    while(a<len(book)):
        try:
            a=book.index('\n\n',a+1)
        except:
            # No further blank line (or any other error): stop scanning.
            # NOTE(review): `book` is read in binary mode; on Python 3 a str
            # needle would raise TypeError here and end the loop at once.
            break
        para_map[i]=a+2
        i+=1
    count[1]=len(ch_map)
    count[2]=len(sec_map)
    count[3]=len(para_map)
    book_trie=make_trie()
def get_pageno(pdf_file):
    """Return the page index of the outline entry whose title starts with III.

    Args:
        pdf_file: Path of the PDF file.

    Returns:
        The zero-based page index of the first outline entry whose title
        starts with ``'III'`` or ``'Ⅲ'`` and that has a destination, or
        ``None`` when there is no such entry.
    """
    logging.debug('get_pageno in...' + pdf_file)
    with open(pdf_file, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map page object id -> zero-based page index.
        pages = dict(
            (page.pageid, pageno)
            for (pageno, page) in enumerate(PDFPage.create_pages(doc))
        )
        # Look for the "III. Financial matters" section.
        for (level, title, dest, a, se) in doc.get_outlines():
            if not title.startswith((u'III', u'Ⅲ')):
                # Unrelated entries are skipped before their destination is
                # touched, so one without a destination cannot abort the
                # search.
                continue
            if not dest:
                # Matching title but no destination to resolve; keep looking.
                continue
            return pages[dest[0].objid]
    return None
def succ_test():
    """Print the permission flags and the outline object of a sample PDF.

    Written for Python 2 (uses ``print`` statements).  Changes the working
    directory to a hard-coded Windows path first.  Any exception in the body
    is printed with ``traceback.print_exc()`` instead of being raised.
    """
    try:
        os.chdir(r'F:\allitebooks\making-games')
        fp = open('Making Games.pdf', 'rb')
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        print "extractable:", document.is_extractable, ",modifiable:", document.is_modifiable, ", printable:", document.is_printable
        # Prints the object returned by get_outlines(), not its entries.
        outlines = document.get_outlines()
        print outlines
    except:
        traceback.print_exc()
    finally:
        # NOTE(review): if chdir/open/PDFParser failed above, `parser` and/or
        # `fp` are unbound here and this raises NameError.
        parser.close()
        fp.close()
def getDocTitle(filename):
    """Return the outline of a PDF as a list of ``[level, title]`` lists.

    Args:
        filename: Path of the PDF file.

    Exceptions raised while reading the outline are not caught.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument

    # Open the PDF; the context manager closes it on every exit path.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, '')
        # Collect the result while the file is still open.
        tit_res = [
            [level, title]
            for (level, title, dest, a, se) in document.get_outlines()
        ]
    return tit_res
def parse(filename, maxlevel):
    """Print the outline of a PDF as HTML headings.

    Args:
        filename: Path of the PDF file.
        maxlevel: Deepest outline level to print; deeper entries are skipped.

    Each printed line has the form ``<hN>title</hN>`` where ``N`` is the
    outline level.  Newlines are removed from the title and runs of
    whitespace collapse to a single space.
    """
    with open(filename, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level > maxlevel:
                continue
            if not isinstance(title, str):
                # Python 2 ``unicode``: encode to a native str as before.
                # On Python 3 the title already is ``str``; encoding it made
                # ``replace('\n', '')`` fail with a bytes/str TypeError.
                title = title.encode('utf8')
            title = ' '.join(title.replace('\n', '').split())
            print('<h{level}>{title}</h{level}>'.format(level=level, title=title))
def convertPDFFilter(self, path):
    """Collect outline titles and per-page text of a PDF into ``self``.

    Args:
        path: Path of the PDF file.

    Returns:
        ``False`` when ``path`` does not exist, otherwise ``True``.

    Outline entries are appended to ``self.titles`` as ``"<level> <title>"``;
    when the PDF has no outlines ``self.titles`` is reset to ``[]``.  The
    text of the horizontal text boxes of each processed page is appended to
    ``self.pages``.  Pages with zero-based index 21..39 are skipped.
    """
    if not os.path.exists(path):
        return False

    fp = open(path, 'rb')
    ri = self.reinit()
    retstr, device, interpreter = ri['retstr'], ri['device'], ri['interpreter']
    document = PDFDocument(PDFParser(fp), self.password)

    try:
        for (level, title, dest, a, se) in document.get_outlines():
            self.titles.append(str(level) + ' ' + title)
    except PDFNoOutlines:
        self.titles = []

    for i, page in enumerate(PDFPage.get_pages(fp)):
        print(i)
        if 20 < i < 40:
            continue
        interpreter.process_page(page)
        layout = device.get_result()
        # Concatenate the text of all horizontal text boxes of this page.
        self.pages.append(''.join(
            e.get_text() for e in layout
            if isinstance(e, LTTextBoxHorizontal)
        ))

    fp.close()
    device.close()
    retstr.close()
    return True
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline of a PDF to ``outfp`` as XML.

    Written for Python 2 (``file()`` builtin) and the pdfminer API in which
    the document is initialised separately via ``doc.initialize(password)``.

    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    accepted but not used here.  Nothing is written when the document has
    no outlines.  Returns ``None``.
    """
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
    # Map page object id -> zero-based page index.
    pages = dict( (page.pageid, pageno) for (pageno,page)
                  in enumerate(PDFPage.create_pages(doc)) )
    def resolve_dest(dest):
        # Named destinations (str / PSLiteral) are looked up in the document;
        # a dict result carries the actual destination under 'D'.
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest['D']
        return dest
    try:
        outlines = doc.get_outlines()
        outfp.write('<outlines>\n')
        for (level,title,dest,a,se) in outlines:
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                # No direct destination: inspect the entry's action instead.
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = resolve_dest(action['D'])
                        pageno = pages[dest[0].objid]
            s = e(title).encode('utf-8', 'xmlcharrefreplace')
            outfp.write('<outline level="%r" title="%s">\n' % (level, s))
            if dest is not None:
                outfp.write('<dest>')
                dumpxml(outfp, dest)
                outfp.write('</dest>\n')
            if pageno is not None:
                outfp.write('<pageno>%r</pageno>\n' % pageno)
            outfp.write('</outline>\n')
        outfp.write('</outlines>\n')
    except PDFNoOutlines:
        pass
    # NOTE(review): not reached when another exception propagates, so parser
    # and file stay open in that case.
    parser.close()
    fp.close()
    return
def display_pageno(pdffile):
    """Return the outline of a PDF with page number and page object id.

    Args:
        pdffile: Path of the PDF file.

    Returns:
        A list of dicts with the keys ``"level"``, ``"title"``, ``"pageno"``
        (1-based) and ``"pageid"``; the last two are ``None`` when no target
        page was determined for the entry.

    Exceptions raised while reading the outline are not caught; the file is
    closed in that case as well.
    """
    result = []
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map page object id -> 1-based page number.
        pages = dict((page.pageid, pageno) for (pageno, page)
                     in enumerate(PDFPage.create_pages(doc), 1))

        def resolve_dest(dest):
            # Named destinations are looked up in the document first.
            if isinstance(dest, str) or isinstance(dest, bytes):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            if isinstance(dest, PDFObjRef):
                dest = dest.resolve()
            return dest

        for (level, title, dest, a, se) in doc.get_outlines():
            pageno = None
            pageid = None
            if dest:
                dest = resolve_dest(dest)
                pageid = dest[0].objid
                pageno = pages[pageid]
            elif a:
                action = a
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                            'D'):
                        dest = resolve_dest(action['D'])
                        pageid = dest[0].objid
                        pageno = pages[pageid]
            result.append({
                "level": level,
                "title": title,
                "pageno": pageno,
                "pageid": pageid
            })
    return result
def __get_outlines_pdf(self, book_name):
    """Collect outline titles and the ids of the pages they link to.

    Args:
        book_name: Path of the PDF file.

    For every outline entry the object id of its destination page is
    appended to ``self.destination`` and its title to ``self.titles``.
    Collection stops at the first entry without a destination, because the
    page a title belongs to cannot be determined without one.  A missing
    outline or a ``TypeError`` while reading it ends collection silently.
    """
    # The context manager closes the file even when an unexpected
    # exception propagates.
    with open(book_name, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        try:
            for (level, title, dest, a, se) in document.get_outlines():
                if not dest:
                    break
                self.destination.append(dest[0].objid)
                self.titles.append(title)
        except (PDFNoOutlines, TypeError):
            pass
def get_toc(pdf_path):
    """Return the outline of a PDF together with the parsed document.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A ``(toc, document)`` tuple.  ``toc`` is a list of dicts with the
        keys ``'level'``, ``'title'`` and ``'ref'``; ``'ref'`` maps each key
        of the entry's resolved action dictionary to ``str(value)`` and is
        ``{}`` when the entry has no action dictionary.
    """
    # The file is not closed here because the returned document was built
    # on top of it (presumably it still reads from the file -- TODO confirm).
    infile = open(pdf_path, 'rb')
    parser = PDFParser(infile)
    document = PDFDocument(parser)
    toc = []
    for (level, title, dest, ref, structelem) in document.get_outlines():
        # The action may be an indirect reference, an already-resolved dict
        # or absent; only references need ``resolve()``.
        resolved_ref = ref.resolve() if hasattr(ref, 'resolve') else ref
        if isinstance(resolved_ref, dict):
            stringified_ref = {
                key: str(value) for key, value in resolved_ref.items()
            }
        else:
            stringified_ref = {}
        toc.append({
            'level': level,
            'title': title,
            'ref': stringified_ref
        })
    return toc, document
def extract_contents(self):
    """Fill ``self.outlines`` with ``(title, page number)`` pairs.

    Reads the PDF behind ``self.fd``, stores the page count in
    ``self.total_pages`` and the ``(page, page number)`` pairs in
    ``self.pages``.  Each outline entry whose destination page is found is
    appended to ``self.outlines``.  Returns ``None``; nothing is appended
    when the document has no built-in outlines.
    """
    parser = PDFParser(self.fd)
    doc = PDFDocument(parser)
    self.total_pages = self.get_pages_total()
    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator, so
    # only the first page lookup would have seen all pages.
    # NOTE(review): range(1, total_pages) yields total_pages - 1 numbers, so
    # the last page gets no number if get_pages_total() is the page count.
    self.pages = list(zip(PDFPage.get_pages(self.fd),
                          range(1, self.total_pages)))
    try:
        outlines = doc.get_outlines()
    except PDFNoOutlines:
        # No built-in outlines.
        return None

    # Page object id -> page number; the first occurrence wins, as in a
    # linear search over ``self.pages``.
    page_numbers = {}
    for page, pagenum in self.pages:
        page_numbers.setdefault(page.pageid, pagenum)

    for (level, title, dest, a, se) in outlines:
        if dest is not None:
            # 0 marks "page not found".
            pn = page_numbers.get(dest[0].objid, 0)
            if pn > 0:
                self.outlines.append((title, pn))
def get_headings(filename):
    """Return outline headings and page count of the PDF matching ``filename``.

    Moves to the parent directory, calls ``rd.open_location("/PDF", True)``
    and scans the current working directory for a file whose name without
    its last 4 characters equals ``filename`` without its last 14 characters.

    Returns ``(headings_list, pages)`` for the first match, where
    ``headings_list`` is a list of ``(level, title)`` tuples, or
    ``(None, pages)`` when reading the outline raises.  Returns ``None``
    implicitly when no file matches.
    """
    os.chdir('..')
    rd.open_location("/PDF",True)
    # Strip the 14-character suffix of the given name for comparison.
    filename_=filename[:-14]
    for compare_filename in os.listdir(os.getcwd()):
        # Compare against the candidate without its 4-character extension.
        if filename_ == compare_filename[:-4]:
            # NOTE(review): this file handle is never closed.
            in_file=open(compare_filename, 'rb')
            parse_file=PDFParser(in_file)
            file=PDFDocument(parse_file)
            # Count the pages of the document.
            pages=0
            for page in PDFPage.get_pages(in_file):
                pages+=1
            headings_list=[]
            try:
                for (level,title,dest,a,structelem) in file.get_outlines():
                    headings_list.append((level,title))
                rd.open_location("/program",True)
                return headings_list,pages
            except:
                # Any failure while reading the outline (bare except).
                rd.open_location("/program",True)
                return None,pages
def valid_toc(self, toc):
    """Check an expected table of contents against the PDF's real outline.

    Args:
        toc: Sequence of reference entries indexed as
            ``(level, page, title)``; the reference level is one less than
            the PDF outline level, and ``page - 1`` is passed to
            ``self._is_reference_to_ith_page``.

    Returns:
        ``True`` when both outlines have the same length and every entry
        matches in level, destination page and title.  When the PDF has no
        outlines, returns whether ``toc`` is empty.
    """
    with open(str(self._doc), "rb") as pdffile:
        document = PDFDocument(PDFParser(pdffile))
        try:
            real_toc = list(document.get_outlines())
        except PDFNoOutlines:
            return len(toc) == 0

        print("TOC from PDF file:", real_toc)
        if len(real_toc) != len(toc):
            print("Incorrect TOC length")
            return False

        for ref, real in zip(toc, real_toc):
            print("Checking", ref)
            # Checked in order with short-circuiting: level, destination,
            # title.
            entry_matches = (
                ref[0] + 1 == real[0]
                and self._is_reference_to_ith_page(real[2][0], ref[1] - 1)
                and ref[2] == real[1]
            )
            if not entry_matches:
                return False
        return True
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# Open a PDF document; the context manager closes it when done.
with open('mypdf.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)

    # Get the outlines of the document.
    outlines = document.get_outlines()
    # Each outline entry is a 5-tuple.  The loop must unpack exactly these
    # five fields and print the names it actually binds.
    for (level, title, dest, a, se) in outlines:
        print(level, title)
def createFromPdfminer(filename):
    """Build a ``PDFInfos`` object for ``filename`` using pdfminer.

    Written for Python 2 (``basestring``, ``str.decode``, list-valued
    ``dict.keys()``) and the pdfminer API with ``doc.initialize()``.

    The result gets these attributes:
      * ``_metaInfo``: string-valued entries of ``doc.info[0]``; values that
        start with the bytes FE FF are decoded as UTF-16.
      * ``_pageCount``: number of pages.
      * ``_outline``: list of ``(level, title, page index)`` or ``None`` when
        the document has no outlines.
      * ``_pageInfos``: one ``PDFPageInfos`` per page with its link
        annotations and page box.
      * ``_names``: named destinations mapped to page indices; only set when
        the catalog has the required ``'Names'`` / ``'Dests'`` keys.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdftypes import PDFObjRef

    # NOTE(review): the file is never closed in this function.
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize()
    assert doc.is_extractable

    result = PDFInfos()
    result._metaInfo = dict((key, str.decode(value, 'utf-16') if value.startswith('\xfe\xff') else value)
                            for key, value in doc.info[0].items()
                            if isinstance(value, basestring))

    # Page object ids in document order; list position == page index.
    pageids = [page.pageid for page in PDFPage.create_pages(doc)]
    result._pageCount = len(pageids)

    def get(obj, attr = None):
        """Resolve PDFObjRefs, otherwise a no-op.

        May also perform dict lookup, i.e. get(obj, 'A') is roughly the
        same as get(obj)['A'] (with the looked-up value resolved as well).
        """
        while isinstance(obj, PDFObjRef):
            obj = obj.resolve()
        if attr is not None:
            return get(obj[attr])
        return obj

    def actionToPageIndex(action):
        # Only 'GoTo' actions are supported here.
        assert get(action, 'S').name == 'GoTo'
        name = get(action, 'D')
        # resolve "named destination":
        dest = get(doc.get_dest(name))
        return destToPageIndex(dest)

    def destToPageIndex(dest):
        dest = get(dest)
        if isinstance(dest, dict):
            assert dest.keys() == ['D'], repr(dest)
            dest = get(dest, 'D')
        # destinations contain the page as first element,
        # the rest concerns the ROI / zoom state (various modes there):
        return pageids.index(dest[0].objid)

    try:
        # Prefer the entry's action when present, else its destination.
        result._outline = [(level, title, actionToPageIndex(a) if a else destToPageIndex(dest))
                           for level, title, dest, a, se in doc.get_outlines()]
    except PDFNoOutlines:
        result._outline = None

    result._pageInfos = []

    # get annotations (links):
    for page in PDFPage.create_pages(doc):
        pageLinks = []
        for anno in get(page.annots) or []:
            anno = get(anno)
            rect = numpy.array(get(anno, 'Rect'), float).reshape((2, 2))
            if 'Dest' in anno:
                # 'Dest' is the older (more compatible) way to
                # specify links
                dest = get(anno, 'Dest')
                pageLinks.append((rect, destToPageIndex(dest)))
            elif 'A' in anno:
                # actions are much more general and include 'GoTo'
                # (with viewport spec.) with variants for remote
                # and embedded documents
                action = get(anno, 'A')
                subType = get(action, 'S').name
                if subType == 'GoTo':
                    pageLinks.append((rect, actionToPageIndex(action)))
                elif subType == 'URI':
                    #assert sorted(action.keys()) == ['S', 'Type', 'URI']
                    link = get(action, 'URI')
                    if link.startswith('file:'):
                        # resolve relative pathname w.r.t. PDF filename:
                        link = 'file:' + os.path.join(os.path.dirname(filename), link[5:])
                    pageLinks.append((rect, link))
        pageBox = numpy.array([page.mediabox], float).reshape((2, 2))
        result._pageInfos.append(PDFPageInfos(links = pageLinks, pageBox = pageBox))

    # extract all named destinations:
    def extract_names(dests, result = None):
        # Walks the name tree recursively: 'Names' holds alternating
        # name / destination items, 'Kids' holds child nodes.
        if result is None:
            result = {}
        if 'Names' in dests:
            it = iter(get(dests, 'Names'))
            # zip(it, it) pairs consecutive items: (name, ref).
            for name, ref in zip(it, it):
                result[name] = destToPageIndex(ref)
        if 'Kids' in dests:
            for kid in get(dests, 'Kids'):
                extract_names(get(kid), result)
        return result

    try:
        result._names = extract_names(get(doc.catalog['Names'], 'Dests'))
    except KeyError:
        # A KeyError raised anywhere in the lookup leaves _names unset.
        pass

    return result
def pdf_cover(self, pdf, images):
    '''Pick the cover image of a volume from the PDF's outline information.

    :param pdf: path to the pdf file for this volume
    :param images: list of image file paths for this volume
    :returns: the lowest zero-based page number among non-blank pages
        labelled "Cover" or "Title Page" in the outline; ``None`` when the
        PDF has no outline or no suitable page is found
    '''
    with open(pdf, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        try:
            outlines = document.get_outlines()
            logger.debug('PDF %s includes outline information, using for cover identification',
                         pdf)
        except PDFNoOutlines:
            logger.debug('PDF %s does not include outline information', pdf)
            return None

        # page object id -> zero-based page number
        pages = {
            page.pageid: pageno
            for (pageno, page) in enumerate(PDFPage.create_pages(document))
        }

        possible_coverpages = []
        for position, (level, title, dest, a, se) in enumerate(outlines, 1):
            # NOTE: some LSDI PDFs trigger a maximum recursion error in
            # pdfminer; bail out after a fixed number of outline items.
            # Caveat: outline entries are not necessarily returned in order.
            if position > 15:
                break

            # dest is the target page object; it can apparently be None,
            # in which case the entry is skipped.
            if dest is None:
                continue

            # Either Cover or Title Page will do; there may be several
            # Covers (e.g. the back cover).
            if title.lower() not in ['cover', 'title page']:
                continue

            # determine the page number for the reference
            page_num = pages[dest[0].objid]

            # the page labelled as cover is sometimes blank; check its image
            try:
                img = images[page_num]
            except IndexError:
                logger.error('Not enough images for requested page number %s',
                             page_num)
                continue

            if self.is_blank_page(img):
                # blank: do NOT include as a possible cover page
                logger.debug('PDF outline places %s at page %s but it is blank',
                             title, page_num)
            else:
                logger.debug('PDF outline places %s at page %s', title, page_num)
                possible_coverpages.append(page_num)

        if possible_coverpages:
            # for now, return the lowest page number: the first cover, or
            # the title page if the cover is blank
            return min(possible_coverpages)