def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline (bookmark tree) of a PDF file to *outfp* as XML.

    Each outline entry is emitted as an ``<outline>`` element carrying its
    level and title, followed by an optional ``<dest>`` element (the resolved
    destination, serialized by ``dumpxml``) and an optional ``<pageno>``
    element (1-based page number looked up from the destination's object id).

    Args:
        outfp: Writable stream that receives the XML text.
        fname: Path of the PDF file to read.
        objids: Not used by this function.
        pagenos: Not used by this function.
        password: Password passed to ``PDFDocument``.
        dumpall: Not used by this function.
        codec: Not used by this function.
        extractdir: Not used by this function.

    Returns:
        None.
    """
    # The context manager guarantees the file is closed even when parsing,
    # destination lookup, or writing raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map each page's object id to its 1-based page number.
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest):
                """Resolve a named or indirect destination to its value."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a and isinstance(a, dict):
                        # No direct destination: fall back to the entry's
                        # action, but only when it is a GoTo with a target.
                        # The subtype is matched through its repr, which is
                        # compared against the text /'GoTo'.
                        subtype = a.get('S')
                        if (subtype and repr(subtype) == '/\'GoTo\''
                                and a.get('D')):
                            dest = resolve_dest(a['D'])
                            pageno = pages[dest[0].objid]
                    # NOTE(review): on Python 3 ``.encode()`` yields bytes, so
                    # ``%s`` renders the title as b'...'. Left unchanged here;
                    # confirm the intended Python version / stream type.
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write(
                        '<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # Deliberate best effort: a missing outline is not treated as
                # an error.
                pass
        finally:
            parser.close()
def get_ToC(file):
    """Print the level and title of every outline entry of a PDF file.

    The document outline (table of contents) is read through ``PDFDocument``
    and each entry is printed as ``level title``. Nothing is returned.

    Args:
        file: Path of the PDF file to open (opened in binary mode).
    """
    # Keep the file open for the whole iteration and close it afterwards,
    # including when parsing raises.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        # Walk the outline entries; only level and title are used.
        for (level, title, dest, a, se) in document.get_outlines():
            print(level, title)