Python PDFDocument.get_outlines示例

编程语言: Python

命名空间/包名称: pdfminer3.pdfdocument

类/类型: PDFDocument

方法/功能: get_outlines

hotexamples.com的示例: 2

Python PDFDocument.get_outlines - 已找到2个示例。这些是从开源项目中提取的、最受好评的 pdfminer3.pdfdocument.PDFDocument.get_outlines 真实 Python 示例。您可以对示例进行评分，以帮助我们提高示例质量。

常用方法

显示隐藏

PDFDocument(15)

getobj(4)

get_outlines(2)

get_pages(1)

initialize(1)

set_parser(1)

示例#1

显示文件

def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None,
                extractdir=None):
    """Write the outline entries of a PDF file to ``outfp`` as XML-like text.

    For every entry yielded by ``PDFDocument.get_outlines()`` an
    ``<outline level=... title=...>`` element is written.  When the entry has
    a destination (directly, or through a ``GoTo`` action) the resolved
    destination is dumped inside ``<dest>`` and the 1-based page number is
    written inside ``<pageno>``.

    Args:
        outfp: Writable text stream; only ``str`` values are written to it.
        fname: Path of the PDF file, opened in binary mode.
        objids: Unused by this function.
        pagenos: Unused by this function.
        password: Password handed to ``PDFDocument``.
        dumpall: Unused by this function.
        codec: Unused by this function.
        extractdir: Unused by this function.

    Returns:
        None.  If ``PDFNoOutlines`` is raised it is swallowed on purpose.
    """
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map each page's object id to its 1-based page number.
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest):
                """Resolve a named/indirect destination to its final value."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            # Only "GoTo" actions carry a destination ('D').
                            if (subtype and repr(subtype) == '/\'GoTo\''
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # Keep the xmlcharrefreplace pass for characters UTF-8
                    # cannot encode, but decode back to str: formatting the
                    # raw bytes with %s would emit "b'...'" into the output.
                    s = e(title).encode('utf-8',
                                        'xmlcharrefreplace').decode('utf-8')
                    outfp.write(
                        '<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # Deliberate best effort: a missing outline is not an error.
                pass
        finally:
            # Close the parser even when parsing or writing fails; the
            # enclosing ``with`` then closes the file itself.
            parser.close()

示例#2

显示文件

def get_ToC(file):
    """Print the level and title of every outline entry of a PDF file.

    NOTE(review): the original docstring said a dataframe with the ToC number
    and name is returned, but the code visible here only prints each entry and
    returns nothing -- confirm whether the function continues beyond this
    excerpt.

    Args:
        file: Path of the PDF file to read; it is opened in binary mode.
    """
    # Open a PDF document.
    # NOTE(review): ``fp`` is not closed anywhere in the visible code.
    fp = open(file, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)

    # Get the outlines of the document.
    outlines = document.get_outlines()
    # Each outline entry is a 5-tuple; only level and title are used here.
    for (level, title, dest, a, se) in outlines:
        print(level, title)