def resource_example():
    """Print the fonts, color spaces and XObjects of a PDF's first page.

    The example has two parts:

    1. Load a standalone TrueType font file and print its basic attributes.
    2. Open a PDF file, take its first page and walk the page's resource
       dictionary (modelled on
       ``pdfminer.pdfinterp.PDFPageInterpreter.init_resources()``), then
       print every font, color space and XObject that was collected.

    Both file paths are placeholders and must be edited before running.
    Exceptions raised while processing the PDF are reported on stdout
    instead of being propagated; the font-file part is not guarded.
    """
    # Everything the example needs is imported locally so that the function
    # is self-contained.
    from pdfminer.pdfcolor import PDFColorSpace
    from pdfminer.pdfcolor import PREDEFINED_COLORSPACE
    from pdfminer.pdffont import TrueTypeFont
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdftypes import PDFObjRef
    from pdfminer.pdftypes import dict_value, list_value, resolve1, stream_value
    from pdfminer.psparser import literal_name

    def get_colorspace(spec):
        """Return the color space described by a resolved ColorSpace entry."""
        is_list = isinstance(spec, list)
        name = literal_name(spec[0] if is_list else spec)
        if is_list and len(spec) >= 2:
            if name == 'ICCBased':
                return PDFColorSpace(name, stream_value(spec[1])['N'])
            if name == 'DeviceN':
                return PDFColorSpace(name, len(list_value(spec[1])))
        # An unknown name yields None instead of aborting the whole example.
        return PREDEFINED_COLORSPACE.get(name)

    font_filepath = '/path/to/font.ttf'
    with open(font_filepath, 'rb') as fp:
        # Alternative loader: CFFFont(font_filepath, fp).
        font = TrueTypeFont(font_filepath, fp)
        print('Font type = {}.'.format(font.fonttype))
        print('Font fp = {}.'.format(font.fp))
        print('Font name = {}.'.format(font.name))
        print('Font tables = {}.'.format(font.tables))

    #--------------------
    pdf_filepath = '/path/to/sample.pdf'

    try:
        # The context manager closes the PDF file on every exit path.
        with open(pdf_filepath, 'rb') as fp:
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()

            # pagenos uses zero-based indices. pagenos is sorted inside the function.
            pages = PDFPage.get_pages(fp, pagenos=None, maxpages=0, password=b'')

            # A default avoids StopIteration when the document has no pages.
            page = next(pages, None)
            if not page:
                print('No page.')
                return

            resources, contents = page.resources, page.contents
            if not resources:
                print('No resource.')
                return

            if contents:
                print('Contents: {}.'.format(contents))

            fontmap, xobjmap = {}, {}
            csmap = PREDEFINED_COLORSPACE.copy()
            for k, v in dict_value(resources).items():
                if k == 'Font':
                    for font_id, spec in dict_value(v).items():
                        # Indirect references carry the object ID that is
                        # passed on to the resource manager.
                        obj_id = spec.objid if isinstance(spec, PDFObjRef) else None
                        fontmap[font_id] = rsrcmgr.get_font(obj_id, dict_value(spec))
                elif k == 'ColorSpace':
                    for cs_id, spec in dict_value(v).items():
                        csmap[cs_id] = get_colorspace(resolve1(spec))
                elif k == 'ProcSet':
                    rsrcmgr.get_procset(list_value(v))
                elif k == 'XObject':
                    xobjmap.update(dict_value(v))

            # Fonts can also be built directly from a font spec dictionary,
            # e.g. PDFType1Font(rsrcmgr, spec), PDFTrueTypeFont(rsrcmgr, spec),
            # PDFType3Font(rsrcmgr, spec) or PDFCIDFont(rsrcmgr, spec).
            # That is not done here because no dedicated spec is available
            # at this point.

            for pdf_font in fontmap.values():
                _print_font_details(pdf_font)

            for cs_id, cs in csmap.items():
                print('CS ID: {}.'.format(cs_id))
                print('\t{}.'.format(cs))

            for xobj_id, xobj in xobjmap.items():
                print('XObj ID: {}.'.format(xobj_id))
                print('\t{}.'.format(xobj))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        # Top-level boundary of the example: report the failure and carry on.
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))


def _print_font_details(font):
    """Print the attributes of one font object obtained from the resource manager.

    Attributes that are probed with ``hasattr`` are printed only when present.
    """
    print('------------------------------------------------------------')
    print('Descriptor: {}.'.format(font.descriptor))
    print('\tFont name: {}, Font type: {}.'.format(font.fontname, type(font).__name__))
    if hasattr(font, 'basefont'):
        print('\tBase font: {}.'.format(font.basefont))
    if hasattr(font, 'flags'):
        print('\tFlags = {}.'.format(font.flags))
    if hasattr(font, 'default_width') and hasattr(font, 'widths'):
        print('\tDefault width = {}, Widths = {}.'.format(font.default_width, font.widths))
    print('\tAscent: {}, {}.'.format(font.ascent, font.get_ascent()))
    print('\tDescent: {}, {}.'.format(font.descent, font.get_descent()))
    if hasattr(font, 'hscale') and hasattr(font, 'vscale'):
        print('\tScale: {}, {}.'.format(font.hscale, font.vscale))
    if hasattr(font, 'leading') and hasattr(font, 'italic_angle'):
        print('\tLeading = {}, Italic angle = {}.'.format(font.leading, font.italic_angle))
    print('\tBbox = {}.'.format(font.bbox))
    if hasattr(font, 'get_width') and hasattr(font, 'get_height'):
        print('\t(width, height) = ({}, {}).'.format(font.get_width(), font.get_height()))
    if hasattr(font, 'is_multibyte') and hasattr(font, 'is_vertical'):
        print('\tis_multibyte = {}, is_vertical = {}.'.format(font.is_multibyte(), font.is_vertical()))
    if hasattr(font, 'cid2unicode') and hasattr(font, 'unicode_map'):
        print('\tcid2unicode = {}, unicode_map = {}.'.format(font.cid2unicode, font.unicode_map))
    # Per-character queries need a concrete CID `cid` or string `s`:
    # font.char_disp(cid), font.to_unichr(cid), font.char_width(cid),
    # font.string_width(s).
def get_font(self, objid, spec):
    """Return the font for ``spec``, dropping its Unicode map for WinAnsi fonts.

    The font is obtained from ``PDFResourceManager.get_font``. When the font
    spec declares ``WinAnsiEncoding``, the font's ``unicode_map`` is cleared
    to work around broken fonts: for text extraction a font should have
    either an Encoding or a Unicode map.

    Args:
        objid: Object ID of the font, forwarded to the base implementation.
        spec: Font specification dictionary.

    Returns:
        The font object, possibly with ``unicode_map`` set to ``None``.
    """
    font = PDFResourceManager.get_font(self, objid, spec)
    # Not every font spec has an 'Encoding' entry, so look it up without
    # raising KeyError; such fonts are returned untouched.
    encoding = spec.get('Encoding')
    if encoding is not None and literal_name(encoding) == 'WinAnsiEncoding':
        font.unicode_map = None
    return font
def get_font(self, objid, spec):
    """Return the font for ``spec`` with its Unicode map always cleared.

    The lookup is delegated to ``PDFResourceManager.get_font``; the
    returned font object then has ``unicode_map`` reset to ``None``.
    """
    resolved_font = PDFResourceManager.get_font(self, objid, spec)
    # Unconditionally discard the Unicode map of the returned font.
    resolved_font.unicode_map = None
    return resolved_font