def get_outlines(doc, pageslist, pagesdict):
    """Collect the outline entries that point at an explicit /XYZ target.

    Parameters:
        doc: object whose ``get_outlines()`` yields 5-tuples; the 2nd, 3rd
            and 4th items are used as title, destination and action
            reference respectively.
        pageslist: container indexed by an integer page reference.
        pagesdict: container indexed by the ``objid`` of a ``PDFObjRef``
            page reference.

    Returns:
        A list of ``Outline(title, destname, pos)`` objects, in the order
        the entries were yielded. Entries with no destination, with a
        destination that is not of the /XYZ form, or whose page could not
        be determined are left out.
    """
    outlines = []
    for (_, title, destname, actionref, _) in doc.get_outlines():
        # No direct destination: fall back to the 'D' entry of a GoTo action.
        if destname is None and actionref:
            action = pdftypes.resolve1(actionref)
            if isinstance(action, dict):
                if action.get('S') is PSLiteralTable.intern('GoTo'):
                    destname = action.get('D')

        if destname is None:
            continue

        dest = resolve_dest(doc, destname)

        # consider targets of the form [page /XYZ left top zoom]
        if dest[1] is not PSLiteralTable.intern('XYZ'):
            continue
        (pageref, _, targetx, targety) = dest[:4]

        # The page may be referenced by plain integer or by object reference.
        page = None
        if type(pageref) is int:
            page = pageslist[pageref]
        elif isinstance(pageref, pdftypes.PDFObjRef):
            page = pagesdict[pageref.objid]
        else:
            sys.stderr.write(
                'Warning: unsupported pageref in outline: %s\n' % pageref)

        if not page:
            continue

        outlines.append(Outline(title, destname, Pos(page, targetx, targety)))
    return outlines
def get_outlines(doc, pagesdict):
    """Build the list of outline entries that target an /XYZ destination.

    Parameters:
        doc: object whose ``get_outlines()`` yields 5-tuples; the 2nd, 3rd
            and 4th items are used as title, destination and action
            reference respectively.
        pagesdict: container indexed by the ``objid`` of the destination's
            page reference.

    Returns:
        A list of ``Outline(title, destname, pos)`` objects, one per entry
        whose resolved destination has the /XYZ literal as second element.
        Entries without a destination, or with another destination form,
        are left out.
    """
    collected = []
    for (_, title, destname, actionref, _) in doc.get_outlines():
        # No direct destination: fall back to the 'D' entry of a GoTo action.
        if destname is None and actionref:
            action = actionref.resolve()
            if isinstance(action, dict):
                if action.get('S') is PSLiteralTable.intern('GoTo'):
                    destname = action.get('D')

        if destname is None:
            continue

        dest = resolve_dest(doc, destname)

        # consider targets of the form [page /XYZ left top zoom]
        if dest[1] is not PSLiteralTable.intern('XYZ'):
            continue

        # The destination must have exactly five elements here.
        (pageref, _, targetx, targety, _) = dest
        location = Pos(pagesdict[pageref.objid], targetx, targety)
        collected.append(Outline(title, destname, location))
    return collected
def get_outlines(doc, pagesdict):
    """Collect the outline entries that point at an explicit /XYZ target.

    Parameters:
        doc: object whose ``get_outlines()`` yields 5-tuples; the 2nd, 3rd
            and 4th items are used as title, destination and action
            reference respectively.
        pagesdict: container indexed by the ``objid`` of the destination's
            page reference; the value found is passed on as the page
            number of the outline.

    Returns:
        A list of ``Outline(title, destname, pageno, targetx, targety)``
        objects, in the order the entries were yielded.

    Only destinations of the form ``[page /XYZ left top zoom]`` carry the
    left/top coordinates read here. Other explicit destination forms
    (e.g. ``[page /Fit]``) have a different number of elements and used to
    abort the whole extraction with a ValueError when unpacked; such
    entries are now skipped instead.
    """
    # Interned literals are compared by identity; look them up once.
    goto = PSLiteralTable.intern('GoTo')
    xyz = PSLiteralTable.intern('XYZ')

    result = []
    for (_, title, destname, actionref, _) in doc.get_outlines():
        # No direct destination: fall back to the 'D' entry of a GoTo action.
        if destname is None and actionref:
            action = actionref.resolve()
            if isinstance(action, dict) and action.get('S') is goto:
                destname = action.get('D')
        if destname is None:
            continue

        dest = resolve_dest(doc, destname)

        # consider only targets of the form [page /XYZ left top zoom]
        if len(dest) < 4 or dest[1] is not xyz:
            continue

        (pageref, _, targetx, targety) = dest[:4]
        pageno = pagesdict[pageref.objid]
        result.append(Outline(title, destname, pageno, targetx, targety))
    return result
#!/usr/bin/env python import sys, zlib from pdfminer.lzw import LZWDecoder from pdfminer.psparser import PSException, PSObject, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, STRICT LITERAL_CRYPT = PSLiteralTable.intern('Crypt') LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl')) LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW')) LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85')) LITERALS_ASCIIHEX_DECODE = (PSLiteralTable.intern('ASCIIHexDecode'), PSLiteralTable.intern('AHx')) ## PDF Objects ## class PDFObject(PSObject): pass class PDFException(PSException): pass class PDFTypeError(PDFException): pass class PDFValueError(PDFException): pass class PDFNotImplementedError(PSException): pass ## PDFObjRef ## class PDFObjRef(PDFObject): def __init__(self, doc, objid, _): if objid == 0: if STRICT:
from pdfminer.psparser import PSStackParser, PSSyntaxError, PSEOF, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, STRICT from pdfminer.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \ PDFStream, PDFObjRef, resolve1, decipher_all, \ int_value, float_value, num_value, str_value, list_value, dict_value, stream_value ## Exceptions ## class PDFSyntaxError(PDFException): pass class PDFNoValidXRef(PDFSyntaxError): pass class PDFEncryptionError(PDFException): pass class PDFPasswordIncorrect(PDFEncryptionError): pass # some predefined literals and keywords. LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') LITERAL_XREF = PSLiteralTable.intern('XRef') LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') ## XRefs ## class XRefObjRange(object): def __init__(self, start, nobjs): self.start = start self.nobjs = nobjs return def __repr__(self):
from pdfminer.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfminer.pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \ LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK from pdfminer.cmap import CMapDB ## Exceptions ## class PDFResourceError(PDFException): pass class PDFInterpreterError(PDFException): pass ## Constants ## LITERAL_PDF = PSLiteralTable.intern('PDF') LITERAL_TEXT = PSLiteralTable.intern('Text') LITERAL_FONT = PSLiteralTable.intern('Font') LITERAL_FORM = PSLiteralTable.intern('Form') LITERAL_IMAGE = PSLiteralTable.intern('Image') ## PDFTextState ## class PDFTextState(object): def __init__(self): self.font = None self.fontsize = 0 self.charspace = 0 self.wordspace = 0
#!/usr/bin/env python import sys from pdfminer.psparser import PSLiteralTable ## PDFColorSpace ## LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') class PDFColorSpace(object): def __init__(self, name, ncomponents): self.name = name self.ncomponents = ncomponents return def __repr__(self): return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents) PREDEFINED_COLORSPACE = dict( (name, PDFColorSpace(name, n)) for (name, n) in { 'CalRGB': 3, 'CalGray': 1, 'Lab': 3, 'DeviceRGB': 3, 'DeviceCMYK': 4, 'DeviceGray': 1, 'Separation': 1,
#!/usr/bin/env python import sys from pdfminer.psparser import PSLiteralTable ## PDFColorSpace ## LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') class PDFColorSpace(object): def __init__(self, name, ncomponents): self.name = name self.ncomponents = ncomponents return def __repr__(self): return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents) PREDEFINED_COLORSPACE = dict( (name, PDFColorSpace(name,n)) for (name,n) in { 'CalRGB': 3, 'CalGray': 1, 'Lab': 3, 'DeviceRGB': 3, 'DeviceCMYK': 4, 'DeviceGray': 1, 'Separation': 1,
char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff else: for c in xrange(sc, ec+1): char2gid[c] = (c + idd) & 0xffff gid2char = dict( (gid, pack('>H', char)) for (char,gid) in char2gid.iteritems() ) return CMap().update(char2gid, gid2char) ## Fonts ## class PDFFontError(PDFException): pass class PDFUnicodeNotDefined(PDFFontError): pass LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') LITERAL_TYPE1C = PSLiteralTable.intern('Type1C') # PDFFont class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): self.descriptor = descriptor self.widths = widths self.fontname = descriptor.get('FontName', 'unknown') if isinstance(self.fontname, PSLiteral): self.fontname = literal_name(self.fontname) self.ascent = num_value(descriptor.get('Ascent', 0)) self.descent = num_value(descriptor.get('Descent', 0)) self.default_width = default_width or descriptor.get('MissingWidth', 0)
class PDFNoValidXRef(PDFSyntaxError): pass class PDFEncryptionError(PDFException): pass class PDFPasswordIncorrect(PDFEncryptionError): pass # some predefined literals and keywords. LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') LITERAL_XREF = PSLiteralTable.intern('XRef') LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') ## XRefs ## class XRefObjRange(object): def __init__(self, start, nobjs): self.start = start self.nobjs = nobjs return def __repr__(self):
return CMap().update(char2gid, gid2char) ## Fonts ## class PDFFontError(PDFException): pass class PDFUnicodeNotDefined(PDFFontError): pass LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') LITERAL_TYPE1C = PSLiteralTable.intern('Type1C') # PDFFont class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): self.descriptor = descriptor self.widths = widths self.fontname = descriptor.get('FontName', 'unknown') if isinstance(self.fontname, PSLiteral): self.fontname = literal_name(self.fontname) self.ascent = num_value(descriptor.get('Ascent', 0)) self.descent = num_value(descriptor.get('Descent', 0)) self.default_width = default_width or descriptor.get('MissingWidth', 0) self.leading = num_value(descriptor.get('Leading', 0))