def set_parser(self, parser): "Set the document to use a given PDFParser object." if self._parser: return self._parser = parser # Retrieve the information of each header that was appended # (maybe multiple times) at the end of the document. self.xrefs = parser.read_xref() for xref in self.xrefs: trailer = xref.get_trailer() if not trailer: continue # If there's an encryption info, remember it. if 'Encrypt' in trailer: #assert not self.encryption self.encryption = (list_value(trailer['ID']), dict_value(trailer['Encrypt'])) if 'Info' in trailer: self.info.append(dict_value(trailer['Info'])) if 'Root' in trailer: # Every PDF file must have exactly one /Root dictionary. self.catalog = dict_value(trailer['Root']) break else: raise PDFSyntaxError('No /Root object! - Is this really a PDF?') if self.catalog.get('Type') is not LITERAL_CATALOG: if STRICT: raise PDFSyntaxError('Catalog not found!') return
def do_Do(self, xobjid): xobjid = literal_name(xobjid) try: xobj = stream_value(self.xobjmap[xobjid]) except KeyError: if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return if 1 <= self.debug: print(('Processing xobj: %r' % xobj)) subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() bbox = list_value(xobj['BBox']) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) # According to PDF reference 1.7 section 4.9.1, XObjects in # earlier PDFs (prior to v1.2) use the page's Resources entry # instead of having their own Resources entry. resources = dict_value(xobj.get('Resources')) or self.resources.copy() self.device.begin_figure(xobjid, bbox, matrix) interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm)) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) self.device.render_image(xobjid, xobj) self.device.end_figure(xobjid) else: # unsupported xobject type. pass return
def __init__(self, doc, pageid, attrs): """Initialize a page object. doc: a PDFDocument object. pageid: any Python object that can uniquely identify the page. attrs: a dictionary of page attributes. """ self.doc = doc self.pageid = pageid self.attrs = dict_value(attrs) self.lastmod = resolve1(self.attrs.get('LastModified')) self.resources = resolve1(self.attrs['Resources']) self.mediabox = resolve1(self.attrs['MediaBox']) if 'CropBox' in self.attrs: self.cropbox = resolve1(self.attrs['CropBox']) else: self.cropbox = self.mediabox self.rotate = (self.attrs.get('Rotate', 0)+360) % 360 self.annots = self.attrs.get('Annots') self.beads = self.attrs.get('B') if 'Contents' in self.attrs: contents = resolve1(self.attrs['Contents']) else: contents = [] if not isinstance(contents, list): contents = [ contents ] self.contents = contents return
def search(obj, parent): if isinstance(obj, int): objid = obj tree = dict_value(self.getobj(objid)).copy() else: objid = obj.objid tree = dict_value(obj).copy() for (k,v) in list(parent.items()): if k in self.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: if 1 <= self.debug: print(('Pages: Kids=%r' % tree['Kids'])) for c in list_value(tree['Kids']): for x in search(c, tree): yield x elif tree.get('Type') is LITERAL_PAGE: if 1 <= self.debug: print(('Page: %r' % tree)) yield (objid, tree)
def lookup_name(self, cat, key): try: names = dict_value(self.catalog['Names']) except (PDFTypeError, KeyError): raise KeyError((cat,key)) # may raise KeyError d0 = dict_value(names[cat]) def lookup(d): if 'Limits' in d: (k1,k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None if 'Names' in d: objs = list_value(d['Names']) names = dict(choplist(2, objs)) return names[key] if 'Kids' in d: for c in list_value(d['Kids']): v = lookup(dict_value(c)) if v: return v raise KeyError((cat,key)) return lookup(d0)
def load_trailer(self, parser): try: (_,kwd) = parser.nexttoken() assert kwd is self.KEYWORD_TRAILER (_,dic) = parser.nextobject() except PSEOF: x = parser.pop(1) if not x: raise PDFNoValidXRef('Unexpected EOF - file corrupted') (_,dic) = x[0] self.trailer.update(dict_value(dic)) return
def get_dest(self, name): try: # PDF-1.2 or later obj = self.lookup_name('Dests', name) except KeyError: # PDF-1.1 or prior if 'Dests' not in self.catalog: raise PDFDestinationNotFound(name) d0 = dict_value(self.catalog['Dests']) if name not in d0: raise PDFDestinationNotFound(name) obj = d0[name] return obj
def lookup(d): if 'Limits' in d: (k1,k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None if 'Names' in d: objs = list_value(d['Names']) names = dict(choplist(2, objs)) return names[key] if 'Kids' in d: for c in list_value(d['Kids']): v = lookup(dict_value(c)) if v: return v raise KeyError((cat,key))
def __init__(self, rsrcmgr, spec): firstchar = int_value(spec.get("FirstChar", 0)) lastchar = int_value(spec.get("LastChar", 0)) widths = list_value(spec.get("Widths", [0] * 256)) widths = dict((i + firstchar, w) for (i, w) in enumerate(widths)) if "FontDescriptor" in spec: descriptor = dict_value(spec["FontDescriptor"]) else: descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]} PDFSimpleFont.__init__(self, descriptor, widths, spec) self.matrix = tuple(list_value(spec.get("FontMatrix"))) (_, self.descent, _, self.ascent) = self.bbox (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1)) return
def init_resources(self, resources): self.resources = resources self.fontmap = {} self.xobjmap = {} self.csmap = PREDEFINED_COLORSPACE.copy() if not resources: return def get_colorspace(spec): if isinstance(spec, list): name = literal_name(spec[0]) else: name = literal_name(spec) if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, stream_value(spec[1])['N']) elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE[name] for (k,v) in list(dict_value(resources).items()): if 2 <= self.debug: print(('Resource: %r: %r' % (k,v))) if k == 'Font': for (fontid,spec) in list(dict_value(v).items()): objid = None if isinstance(spec, PDFObjRef): objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == 'ColorSpace': for (csid,spec) in list(dict_value(v).items()): self.csmap[csid] = get_colorspace(resolve1(spec)) elif k == 'ProcSet': self.rsrcmgr.get_procset(list_value(v)) elif k == 'XObject': for (xobjid,xobjstrm) in list(dict_value(v).items()): self.xobjmap[xobjid] = xobjstrm return
def search(entry, level): entry = dict_value(entry) if 'Title' in entry: if 'A' in entry or 'Dest' in entry: title = decode_text(str_value(entry['Title'])) dest = entry.get('Dest') action = entry.get('A') se = entry.get('SE') yield (level, title, dest, action, se) if 'First' in entry and 'Last' in entry: for x in search(entry['First'], level+1): yield x if 'Next' in entry: for x in search(entry['Next'], level): yield x return
def get_font(self, objid, spec): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: if 2 <= self.debug: print(('get_font: create: objid=%r, spec=%r' % (objid, spec))) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') # Create a Font object. if 'Subtype' in spec: subtype = literal_name(spec['Subtype']) else: if STRICT: raise PDFFontError('Font Subtype is not specified.') subtype = 'Type1' if subtype in ('Type1', 'MMType1'): # Type1 Font font = PDFType1Font(self, spec) elif subtype == 'TrueType': # TrueType Font font = PDFTrueTypeFont(self, spec) elif subtype == 'Type3': # Type3 Font font = PDFType3Font(self, spec) elif subtype in ('CIDFontType0', 'CIDFontType2'): # CID Font font = PDFCIDFont(self, spec) elif subtype == 'Type0': # Type0 Font dfonts = list_value(spec['DescendantFonts']) assert dfonts subspec = dict_value(dfonts[0]).copy() for k in ('Encoding', 'ToUnicode'): if k in spec: subspec[k] = resolve1(spec[k]) font = self.get_font(None, subspec) else: if STRICT: raise PDFFontError('Invalid Font spec: %r' % spec) font = PDFType1Font(self, spec) # this is so wrong! if objid and self.caching: self._cached_fonts[objid] = font return font
def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec["BaseFont"]) except KeyError: if STRICT: raise PDFFontError("BaseFont is missing") self.basefont = "unknown" try: (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) except KeyError: descriptor = dict_value(spec.get("FontDescriptor", {})) firstchar = int_value(spec.get("FirstChar", 0)) lastchar = int_value(spec.get("LastChar", 255)) widths = list_value(spec.get("Widths", [0] * 256)) widths = dict((i + firstchar, w) for (i, w) in enumerate(widths)) PDFSimpleFont.__init__(self, descriptor, widths, spec) if "Encoding" not in spec and "FontFile" in descriptor: # try to recover the missing encoding info from the font file. self.fontfile = stream_value(descriptor.get("FontFile")) length1 = int_value(self.fontfile["Length1"]) data = self.fontfile.get_data()[:length1] parser = Type1FontHeaderParser(StringIO(data)) self.cid2unicode = parser.get_encoding() return
def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec["BaseFont"]) except KeyError: if STRICT: raise PDFFontError("BaseFont is missing") self.basefont = "unknown" self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {})) self.cidcoding = "%s-%s" % ( self.cidsysteminfo.get("Registry", "unknown"), self.cidsysteminfo.get("Ordering", "unknown"), ) try: name = literal_name(spec["Encoding"]) except KeyError: if STRICT: raise PDFFontError("Encoding is unspecified") name = "unknown" try: self.cmap = CMapDB.get_cmap(name) except CMapDB.CMapNotFound as e: if STRICT: raise PDFFontError(e) self.cmap = CMap() try: descriptor = dict_value(spec["FontDescriptor"]) except KeyError: if STRICT: raise PDFFontError("FontDescriptor is missing") descriptor = {} ttf = None if "FontFile2" in descriptor: self.fontfile = stream_value(descriptor.get("FontFile2")) ttf = TrueTypeFont(self.basefont, StringIO(self.fontfile.get_data())) self.unicode_map = None if "ToUnicode" in spec: strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() CMapParser(self.unicode_map, StringIO(strm.get_data())).run() elif self.cidcoding == "Adobe-Identity": if ttf: try: self.unicode_map = ttf.create_unicode_map() except TrueTypeFont.CMapNotFound: pass else: try: self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical()) except CMapDB.CMapNotFound as e: pass self.vertical = self.cmap.is_vertical() if self.vertical: # writing mode: vertical widths = get_widths2(list_value(spec.get("W2", []))) self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in list(widths.items())) (vy, w) = spec.get("DW2", [880, -1000]) self.default_disp = (None, vy) widths = dict((cid, w) for (cid, (w, _)) in list(widths.items())) default_width = w else: # writing mode: horizontal self.disps = {} self.default_disp = 0 widths = get_widths(list_value(spec.get("W", []))) default_width = spec.get("DW", 1000) PDFFont.__init__(self, descriptor, widths, default_width=default_width) return
def do_keyword(self, pos, token): """Handles PDF-related keywords.""" if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): self.add_results(*self.pop(1)) elif token is self.KEYWORD_ENDOBJ: self.add_results(*self.pop(4)) elif token is self.KEYWORD_NULL: # null object self.push((pos, None)) elif token is self.KEYWORD_R: # reference to indirect object try: ((_,objid), (_,genno)) = self.pop(2) (objid, genno) = (int(objid), int(genno)) obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: pass elif token is self.KEYWORD_STREAM: # stream object ((_,dic),) = self.pop(1) dic = dict_value(dic) objlen = 0 if not self.fallback: try: objlen = int_value(dic['Length']) except KeyError: if STRICT: raise PDFSyntaxError('/Length is undefined: %r' % dic) self.seek(pos) try: (_, line) = self.nextline() # 'stream' except PSEOF: if STRICT: raise PDFSyntaxError('Unexpected EOF') return pos += len(line) self.fp.seek(pos) data = self.fp.read(objlen) self.seek(pos+objlen) while 1: try: (linepos, line) = self.nextline() except PSEOF: if STRICT: raise PDFSyntaxError('Unexpected EOF') break if 'endstream' in line: i = line.index('endstream') objlen += i data += line[:i] break objlen += len(line) data += line self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary if 2 <= self.debug: print(('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ (pos, objlen, dic, data[:10]))) obj = PDFStream(dic, data, self.doc.decipher) self.push((pos, obj)) else: # others self.push((pos, token)) return