def test_get_cmap_from_pickle():
    """Check that a named CMap is loaded for a CID font.

    Builds a ``PDFCIDFont`` whose ``Encoding`` is the literal
    ``UniGB-UCS2-H`` and asserts that ``get_cmap_from_spec`` hands back a
    cmap carrying that ``CMapName`` and a non-empty ``code2cid`` table.

    Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
    """
    expected_name = 'UniGB-UCS2-H'
    font_spec = {'Encoding': PSLiteral(expected_name)}
    cid_font = PDFCIDFont(PDFResourceManager(), font_spec)

    loaded = cid_font.get_cmap_from_spec(font_spec, False)

    assert_equal(loaded.attrs.get('CMapName'), expected_name)
    assert_greater(len(loaded.code2cid), 0)
def get_font(self, objid, spec):
    """Return the font object for ``spec``, reusing a cached one when possible.

    :param objid: object id used as the cache key. A falsy value disables
        both the cache lookup and the cache store for this call.
    :param spec: font dictionary; its ``Subtype`` entry selects the font
        class to instantiate.
    :return: the cached font, or a newly created one.
    :raises PDFFontError: only when ``settings.STRICT`` is set and ``Type``
        is not ``LITERAL_FONT``, ``Subtype`` is absent, or the subtype is
        not one of the handled values.
    """
    if objid and objid in self._cached_fonts:
        return self._cached_fonts[objid]

    if settings.STRICT:
        if spec['Type'] is not LITERAL_FONT:
            raise PDFFontError('Type is not /Font')

    # Create a Font object.
    if 'Subtype' in spec:
        subtype = literal_name(spec['Subtype'])
    else:
        if settings.STRICT:
            raise PDFFontError('Font Subtype is not specified.')
        # Lenient mode: treat a font without a Subtype as Type1.
        subtype = 'Type1'

    if subtype in ('Type1', 'MMType1'):
        # Type1 Font
        font = PDFType1Font(self, spec)
    elif subtype == 'TrueType':
        # TrueType Font
        font = PDFTrueTypeFont(self, spec)
    elif subtype == 'Type3':
        # Type3 Font
        font = PDFType3Font(self, spec)
    elif subtype in ('CIDFontType0', 'CIDFontType2'):
        # CID Font - Ensure recursive object references have been resolved.
        # Only an inline dictionary is touched: a missing entry or an
        # indirect reference is passed through unchanged to PDFCIDFont
        # instead of failing here on the lookup.
        cid_system_info = spec.get('CIDSystemInfo')
        if isinstance(cid_system_info, dict):
            # Snapshot the items so values can be replaced while looping.
            for key, value in list(cid_system_info.items()):
                if isinstance(value, PDFObjRef):
                    cid_system_info[key] = value.resolve()
        font = PDFCIDFont(self, spec)
    elif subtype == 'Type0':
        # Type0 Font: build the first descendant font, carrying over the
        # parent's Encoding / ToUnicode entries when present.
        dfonts = list_value(spec['DescendantFonts'])
        assert dfonts
        subspec = dict_value(dfonts[0]).copy()
        for k in ('Encoding', 'ToUnicode'):
            if k in spec:
                subspec[k] = resolve1(spec[k])
        # objid=None: the descendant is not cached under its own id.
        font = self.get_font(None, subspec)
    else:
        if settings.STRICT:
            raise PDFFontError('Invalid Font spec: %r' % spec)
        # Lenient mode: fall back to Type1 for an unrecognised subtype.
        font = PDFType1Font(self, spec)

    if objid and self.caching:
        self._cached_fonts[objid] = font
    return font
def get_font(self, objid, spec):
    """Look up or build the font object described by ``spec``.

    A truthy ``objid`` already present in ``self.fonts`` is answered from
    that mapping. Otherwise a font class is picked from the ``Subtype``
    entry of ``spec``, instantiated, stored under ``objid`` (when truthy)
    and returned. With ``STRICT`` set, a wrong ``Type``, a missing
    ``Subtype`` or an unhandled subtype raises ``PDFFontError``; without
    it, the two latter cases fall back to ``PDFType1Font``.
    """
    # Fast path: this object id was resolved before.
    if objid and objid in self.fonts:
        return self.fonts[objid]

    if STRICT and spec['Type'] is not LITERAL_FONT:
        raise PDFFontError('Type is not /Font')

    # Work out which kind of font is being requested.
    if 'Subtype' in spec:
        kind = literal_name(spec['Subtype'])
    elif STRICT:
        raise PDFFontError('Font Subtype is not specified.')
    else:
        kind = 'Type1'

    # Create a Font object.
    if kind in ('Type1', 'MMType1'):
        built = PDFType1Font(self, spec)
    elif kind == 'TrueType':
        built = PDFTrueTypeFont(self, spec)
    elif kind == 'Type3':
        built = PDFType3Font(self, spec)
    elif kind in ('CIDFontType0', 'CIDFontType2'):
        built = PDFCIDFont(self, spec)
    elif kind == 'Type0':
        # Composite font: delegate to its first descendant, handing down
        # the parent's Encoding / ToUnicode entries when it has them.
        descendants = list_value(spec['DescendantFonts'])
        assert descendants
        child_spec = dict_value(descendants[0]).copy()
        for inherited in ('Encoding', 'ToUnicode'):
            if inherited in spec:
                child_spec[inherited] = resolve1(spec[inherited])
        built = self.get_font(None, child_spec)
    else:
        if STRICT:
            raise PDFFontError('Invalid Font spec: %r' % spec)
        # Original author's remark on this fallback: "this is so wrong!"
        built = PDFType1Font(self, spec)

    if objid:
        self.fonts[objid] = built
    return built
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
    """Stream Encoding with CMapName PSLiteral('DLIdent-H') gives an IdentityCMap."""
    encoding_stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
    cid_font = PDFCIDFont(None, {'Encoding': encoding_stream})
    assert isinstance(cid_font.cmap, IdentityCMap)
def test_encoding_DLIdentV_as_stream(self):
    """Stream Encoding with the plain-string CMapName 'DLIdent-V' gives an IdentityCMap."""
    encoding_stream = PDFStream({'CMapName': 'DLIdent-V'}, '')
    cid_font = PDFCIDFont(None, {'Encoding': encoding_stream})
    assert isinstance(cid_font.cmap, IdentityCMap)
def test_encoding_DLIdentV(self):
    """Encoding given directly as PSLiteral('DLIdent-V') gives an IdentityCMap."""
    cid_font = PDFCIDFont(None, {'Encoding': PSLiteral('DLIdent-V')})
    assert isinstance(cid_font.cmap, IdentityCMap)
def test_cmapname_H(self):
    """Stream Encoding with CMapName PSLiteral('H') gives a CMap instance."""
    encoding_stream = PDFStream({'CMapName': PSLiteral('H')}, '')
    cid_font = PDFCIDFont(None, {'Encoding': encoding_stream})
    assert isinstance(cid_font.cmap, CMap)
def test_cmapname_onebyteidentityH(self):
    """Stream Encoding with CMapName PSLiteral('OneByteIdentityH') gives an IdentityCMapByte."""
    encoding_stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
    cid_font = PDFCIDFont(None, {'Encoding': encoding_stream})
    assert isinstance(cid_font.cmap, IdentityCMapByte)
def test_font_without_spec(self):
    """A PDFCIDFont built from an empty spec still exposes a CMap instance."""
    empty_spec = {}
    cid_font = PDFCIDFont(None, empty_spec)
    assert isinstance(cid_font.cmap, CMap)
def resource_example():
    """Print metadata of a font file, then dump the resources of one PDF page.

    Part 1 opens the file at ``font_filepath``, wraps it in ``TrueTypeFont``
    and prints its ``fonttype``, ``fp``, ``name`` and ``tables`` attributes.

    Part 2 opens ``pdf_filepath``, takes the first page yielded by
    ``PDFPage.get_pages``, walks its resource dictionary (``Font``,
    ``ColorSpace``, ``ProcSet``, ``XObject`` entries) and prints the
    collected fonts, colour spaces and XObjects. Exceptions raised in part 2
    are printed instead of propagated, and the PDF file is closed in the
    ``finally`` clause. Part 1 has no such handler.

    Both paths are hard-coded placeholders. Returns ``None``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed listing - confirm against the original layout.
    """
    from pdfminer.pdffont import CFFFont, TrueTypeFont
    from pdfminer.pdffont import PDFFont, PDFSimpleFont, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
    from pdfminer.psparser import literal_name
    from pdfminer.pdftypes import PDFObjRef
    from pdfminer.pdftypes import list_value, dict_value, stream_value
    from pdfminer.pdfcolor import PDFColorSpace
    from pdfminer.pdfcolor import PREDEFINED_COLORSPACE

    # Part 1: inspect a standalone font file.
    font_filepath = '/path/to/font.ttf'
    with open(font_filepath, 'rb') as fp:
        #font = CFFFont(font_filepath, fp)
        font = TrueTypeFont(font_filepath, fp)
        print('Font type = {}.'.format(font.fonttype))
        print('Font fp = {}.'.format(font.fp))
        print('Font name = {}.'.format(font.name))
        print('Font tables = {}.'.format(font.tables))

    #--------------------
    # Part 2: inspect the resources of a PDF page.
    pdf_filepath = '/path/to/sample.pdf'
    # Pre-set to None so the finally clause can tell whether open() succeeded.
    fp = None
    try:
        # Open a PDF file.
        fp = open(pdf_filepath, 'rb')
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        pages = PDFPage.get_pages(
            fp, pagenos=None, maxpages=0, password=b''
        )  # pagenos uses zero-based indices. pagenos is sorted inside the function.
        # NOTE(review): next() raises StopIteration when no page is yielded;
        # that lands in the generic `except Exception` handler below, so the
        # `if page:` check does not cover the "no pages" case.
        page = next(pages)
        if page:
            resources, contents = page.resources, page.contents
            if not resources:
                print('No resource.')
                # The finally clause still closes fp on this early exit.
                return
            if contents:
                print('Contents: {}.'.format(contents))
                #for ct in contents:
                #    print(ct.resolve())

            # REF [function] >> pdfminer.pdfinterp.PDFPageInterpreter.init_resources()
            def get_colorspace(spec):
                """Map a colour-space spec (a name or a list led by a name) to a colour space.

                'ICCBased' and 'DeviceN' list specs with at least two items
                yield a new PDFColorSpace; anything else is looked up by
                name in PREDEFINED_COLORSPACE (KeyError for unknown names).
                """
                if isinstance(spec, list):
                    name = literal_name(spec[0])
                else:
                    name = literal_name(spec)
                if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
                    # Second argument is read from the 'N' entry of the stream.
                    return PDFColorSpace(name, stream_value(spec[1])['N'])
                elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
                    # Second argument is the length of the list in spec[1].
                    return PDFColorSpace(name, len(list_value(spec[1])))
                else:
                    return PREDEFINED_COLORSPACE[name]

            fontmap, xobjmap = dict(), dict()
            # Start from a copy so the predefined table itself is not modified.
            csmap = PREDEFINED_COLORSPACE.copy()
            for (k, v) in dict_value(resources).items():
                #if 2 <= self.debug:
                #    print >>stderr, 'Resource: %r: %r' % (k,v)
                if k == 'Font':
                    for (font_id, spec) in dict_value(v).items():
                        obj_id = None
                        # Only an indirect reference carries an object id.
                        if isinstance(spec, PDFObjRef):
                            obj_id = spec.objid
                        spec = dict_value(spec)
                        fontmap[font_id] = rsrcmgr.get_font(obj_id, spec)
                elif k == 'ColorSpace':
                    for (cs_id, spec) in dict_value(v).items():
                        # NOTE(review): resolve1 is not among the local imports
                        # above; presumably imported at module level - verify.
                        csmap[cs_id] = get_colorspace(resolve1(spec))
                elif k == 'ProcSet':
                    rsrcmgr.get_procset(list_value(v))
                elif k == 'XObject':
                    for (xobj_id, xobjstrm) in dict_value(v).items():
                        xobjmap[xobj_id] = xobjstrm

            #spec = ...
            #if 'FontDescriptor' in spec:
            #    print('FontDescriptor: {}.'.format(spec['FontDescriptor'].resolve()))
            # NOTE(review): `spec` here is whatever the resource loop above
            # last bound (unbound if no Font/ColorSpace entry was seen), and
            # each `font` below is overwritten before being used. These look
            # like leftover experiments - confirm whether they are intended.
            font = PDFType1Font(rsrcmgr, spec)
            font = PDFTrueTypeFont(rsrcmgr, spec)
            #font = PDFType3Font(rsrcmgr, spec)
            font = PDFCIDFont(rsrcmgr, spec)

            # Report every font collected from the 'Font' resource entry.
            # Attributes guarded by hasattr() are printed only when present.
            for font_id, font in fontmap.items():
                print(
                    '------------------------------------------------------------'
                )
                print('Descriptor: {}.'.format(font.descriptor))
                print('\tFont name: {}, Font type: {}.'.format(
                    font.fontname, type(font).__name__))
                if hasattr(font, 'basefont'):
                    print('\tBase font: {}.'.format(font.basefont))
                if hasattr(font, 'flags'):
                    print('\tFlags = {}.'.format(font.flags))
                if hasattr(font, 'default_width') and hasattr(font, 'widths'):
                    print('\tDefault width = {}, Widths = {}.'.format(
                        font.default_width, font.widths))
                print('\tAscent: {}, {}.'.format(font.ascent, font.get_ascent()))
                print('\tDescent: {}, {}.'.format(font.descent, font.get_descent()))
                if hasattr(font, 'hscale') and hasattr(font, 'vscale'):
                    print('\tScale: {}, {}.'.format(font.hscale, font.vscale))
                if hasattr(font, 'leading') and hasattr(font, 'italic_angle'):
                    print('\tLeading = {}, Italic angle = {}.'.format(
                        font.leading, font.italic_angle))
                print('\tBbox = {}.'.format(font.bbox))
                if hasattr(font, 'get_width') and hasattr(font, 'get_height'):
                    print('\t(width, height) = ({}, {}).'.format(
                        font.get_width(), font.get_height()))
                if hasattr(font, 'is_multibyte') and hasattr(
                        font, 'is_vertical'):
                    print('\tis_multibyte = {}, is_vertical = {}.'.format(
                        font.is_multibyte(), font.is_vertical()))
                if hasattr(font, 'cid2unicode') and hasattr(
                        font, 'unicode_map'):
                    print('\tcid2unicode = {}, unicode_map = {}.'.format(
                        font.cid2unicode, font.unicode_map))
                #if hasattr(font, 'char_disp'):
                #    print('\tchar_disp({}) = {}.'.format(cid, font.char_disp(cid)))
                #if hasattr(font, 'to_unichr'):
                #    print('\tto_unichr({}) = {}.'.format(cid, font.to_unichr(cid)))
                #if hasattr(font, 'char_width') and hasattr(font, 'string_width'):
                #    print('\tchar_width({}) = {}, string_width({}) = {}.'.format(cid, font.char_width(cid), s, font.string_width(s)))

            # Report colour spaces (the predefined ones plus those of the page).
            for cs_id, cs in csmap.items():
                print('CS ID: {}.'.format(cs_id))
                print('\t{}.'.format(cs))
            # Report the XObject entries as stored in the resource dictionary.
            for xobj_id, xobj in xobjmap.items():
                print('XObj ID: {}.'.format(xobj_id))
                print('\t{}.'.format(xobj))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        # Catch-all for this example: the error is printed, not re-raised.
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
    finally:
        if fp:
            fp.close()