示例#1
0
def resource_example():
    """Print TrueType font-file metadata and the resources of a PDF's first page.

    The function first parses the font file at ``font_filepath`` with
    ``TrueTypeFont`` and prints its type, file object, name and tables.  It
    then opens ``pdf_filepath``, takes the first page yielded by
    ``PDFPage.get_pages`` and walks that page's resource dictionary,
    collecting fonts, color spaces and XObjects and printing what it found.

    Both paths are hard-coded placeholders.  A missing font file raises
    ``FileNotFoundError`` to the caller; errors raised while processing the
    PDF are caught and reported with ``print``.
    """
    # All pdfminer names used below are imported locally, matching the
    # function-scope import style of this example.
    from pdfminer.pdfcolor import PDFColorSpace
    from pdfminer.pdfcolor import PREDEFINED_COLORSPACE
    from pdfminer.pdffont import TrueTypeFont
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdftypes import PDFObjRef
    from pdfminer.pdftypes import dict_value, list_value, resolve1, stream_value
    from pdfminer.psparser import literal_name

    # REF [function] >> pdfminer.pdfinterp.PDFPageInterpreter.init_resources()
    def get_colorspace(spec):
        """Build a PDFColorSpace for *spec*, or look up a predefined one by name."""
        is_list = isinstance(spec, list)
        name = literal_name(spec[0]) if is_list else literal_name(spec)
        if name == 'ICCBased' and is_list and 2 <= len(spec):
            return PDFColorSpace(name, stream_value(spec[1])['N'])
        if name == 'DeviceN' and is_list and 2 <= len(spec):
            return PDFColorSpace(name, len(list_value(spec[1])))
        return PREDEFINED_COLORSPACE[name]

    font_filepath = '/path/to/font.ttf'
    with open(font_filepath, 'rb') as fp:
        # Alternative parser from pdfminer.pdffont: CFFFont(font_filepath, fp).
        font = TrueTypeFont(font_filepath, fp)
        print('Font type = {}.'.format(font.fonttype))
        print('Font fp = {}.'.format(font.fp))
        print('Font name = {}.'.format(font.name))
        print('Font tables = {}.'.format(font.tables))

    #--------------------
    pdf_filepath = '/path/to/sample.pdf'

    try:
        # The context manager closes the file on every exit path, including
        # the early returns below.
        with open(pdf_filepath, 'rb') as fp:
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()

            # pagenos uses zero-based indices. pagenos is sorted inside the function.
            pages = PDFPage.get_pages(
                fp, pagenos=None, maxpages=0, password=b''
            )
            # Default to None so that a document without pages is reported
            # here instead of surfacing as a StopIteration.
            page = next(pages, None)
            if page is None:
                print('No page.')
                return

            resources, contents = page.resources, page.contents
            if not resources:
                print('No resource.')
                return

            if contents:
                print('Contents: {}.'.format(contents))

            fontmap, xobjmap = {}, {}
            csmap = PREDEFINED_COLORSPACE.copy()
            for key, value in dict_value(resources).items():
                if key == 'Font':
                    for font_id, spec in dict_value(value).items():
                        # Only an indirect reference carries an object ID to
                        # hand to get_font(); otherwise None is passed.
                        obj_id = spec.objid if isinstance(spec, PDFObjRef) else None
                        fontmap[font_id] = rsrcmgr.get_font(obj_id, dict_value(spec))
                elif key == 'ColorSpace':
                    for cs_id, spec in dict_value(value).items():
                        csmap[cs_id] = get_colorspace(resolve1(spec))
                elif key == 'ProcSet':
                    rsrcmgr.get_procset(list_value(value))
                elif key == 'XObject':
                    for xobj_id, xobjstrm in dict_value(value).items():
                        xobjmap[xobj_id] = xobjstrm

            # NOTE: the font objects are taken from rsrcmgr.get_font() above.
            # The font classes are deliberately not instantiated here: at this
            # point no single font spec is in scope (the loop variable may be
            # undefined or may hold a color-space spec).

            for font in fontmap.values():
                print(
                    '------------------------------------------------------------'
                )
                print('Descriptor: {}.'.format(font.descriptor))
                print('\tFont name: {}, Font type: {}.'.format(
                    font.fontname,
                    type(font).__name__))
                # The attributes guarded with hasattr() are printed only when
                # the font object provides them.
                if hasattr(font, 'basefont'):
                    print('\tBase font: {}.'.format(font.basefont))
                if hasattr(font, 'flags'):
                    print('\tFlags = {}.'.format(font.flags))
                if hasattr(font, 'default_width') and hasattr(font, 'widths'):
                    print('\tDefault width = {}, Widths = {}.'.format(
                        font.default_width, font.widths))
                print('\tAscent: {}, {}.'.format(font.ascent,
                                                 font.get_ascent()))
                print('\tDescent: {}, {}.'.format(font.descent,
                                                  font.get_descent()))
                if hasattr(font, 'hscale') and hasattr(font, 'vscale'):
                    print('\tScale: {}, {}.'.format(font.hscale, font.vscale))
                if hasattr(font, 'leading') and hasattr(font, 'italic_angle'):
                    print('\tLeading = {}, Italic angle = {}.'.format(
                        font.leading, font.italic_angle))
                print('\tBbox = {}.'.format(font.bbox))
                if hasattr(font, 'get_width') and hasattr(font, 'get_height'):
                    print('\t(width, height) = ({}, {}).'.format(
                        font.get_width(), font.get_height()))
                if hasattr(font, 'is_multibyte') and hasattr(
                        font, 'is_vertical'):
                    print('\tis_multibyte = {}, is_vertical = {}.'.format(
                        font.is_multibyte(), font.is_vertical()))
                if hasattr(font, 'cid2unicode') and hasattr(
                        font, 'unicode_map'):
                    print('\tcid2unicode = {}, unicode_map = {}.'.format(
                        font.cid2unicode, font.unicode_map))
            for cs_id, cs in csmap.items():
                print('CS ID: {}.'.format(cs_id))
                print('\t{}.'.format(cs))
            for xobj_id, xobj in xobjmap.items():
                print('XObj ID: {}.'.format(xobj_id))
                print('\t{}.'.format(xobj))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        # Top-level boundary of the example: report the failure and carry on.
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
示例#2
0
 def get_font(self, objid, spec):
     """Return the font for *spec*, dropping its unicode map for WinAnsi fonts.

     The font is obtained from ``PDFResourceManager.get_font``.  When the
     font spec declares ``WinAnsiEncoding`` the font's ``unicode_map`` is
     cleared before the font is returned.
     """
     font = PDFResourceManager.get_font(self, objid, spec)
     # Correct broken font - either it has an Encoding or a Unicode_map for text extraction
     # A spec without an 'Encoding' entry is left untouched instead of
     # failing with a KeyError.
     encoding = spec.get('Encoding')
     if encoding is not None and literal_name(encoding) == 'WinAnsiEncoding':
         font.unicode_map = None
     return font
示例#3
0
 def get_font(self, objid, spec):
     """Return the font from ``PDFResourceManager.get_font`` with ``unicode_map`` set to None."""
     resolved = PDFResourceManager.get_font(self, objid, spec)
     # The unicode map is cleared unconditionally, for every font.
     setattr(resolved, 'unicode_map', None)
     return resolved