def set_parser(self, parser): "Set the document to use a given PDFParser object." if self._parser: return self._parser = parser # Retrieve the information of each header that was appended # (maybe multiple times) at the end of the document. self.xrefs = parser.read_xref() for xref in self.xrefs: trailer = xref.get_trailer() if not trailer: continue # If there's an encryption info, remember it. if 'Encrypt' in trailer: #assert not self.encryption self.encryption = (list_value(trailer['ID']), dict_value(trailer['Encrypt'])) if 'Info' in trailer: self.info.append(dict_value(trailer['Info'])) if 'Root' in trailer: # Every PDF file must have exactly one /Root dictionary. self.catalog = dict_value(trailer['Root']) break else: raise PDFSyntaxError('No /Root object! - Is this really a PDF?') if self.catalog.get('Type') is not LITERAL_CATALOG: if STRICT: raise PDFSyntaxError('Catalog not found!') return
def init_resources(self, resources): self.fontmap = {} self.xobjmap = {} self.csmap = PREDEFINED_COLORSPACE.copy() if not resources: return def get_colorspace(spec): if isinstance(spec, list): name = literal_name(spec[0]) else: name = literal_name(spec) if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, stream_value(spec[1])['N']) elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE[name] for (k,v) in dict_value(resources).iteritems(): if 1 <= self.debug: print >>stderr, 'Resource: %r: %r' % (k,v) if k == 'Font': for (fontid,spec) in dict_value(v).iteritems(): objid = None if isinstance(spec, PDFObjRef): objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = self.rsrc.get_font(objid, spec) elif k == 'ColorSpace': for (csid,spec) in dict_value(v).iteritems(): self.csmap[csid] = get_colorspace(resolve1(spec)) elif k == 'ProcSet': self.rsrc.get_procset(list_value(v)) elif k == 'XObject': for (xobjid,xobjstrm) in dict_value(v).iteritems(): self.xobjmap[xobjid] = xobjstrm return
def lookup_name(self, cat, key): try: names = dict_value(self.catalog['Names']) except (PDFTypeError, KeyError): raise KeyError((cat, key)) # may raise KeyError d0 = dict_value(names[cat]) def lookup(d): if 'Limits' in d: (k1, k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None if 'Names' in d: objs = list_value(d['Names']) names = dict(choplist(2, objs)) return names[key] if 'Kids' in d: for c in list_value(d['Kids']): v = lookup(dict_value(c)) if v: return v raise KeyError((cat, key)) return lookup(d0)
def set_parser(self, parser): "Set the document to use a given PDFParser object." if self.parser: return self.parser = parser # The document is set to be temporarily ready during collecting # all the basic information about the document, e.g. # the header, the encryption information, and the access rights # for the document. self._initialized = True # Retrieve the information of each header that was appended # (maybe multiple times) at the end of the document. self.xrefs = parser.read_xref() for xref in self.xrefs: trailer = xref.get_trailer() if not trailer: continue # If there's an encryption info, remember it. if 'Encrypt' in trailer: #assert not self.encryption self.encryption = (list_value(trailer['ID']), dict_value(trailer['Encrypt'])) if 'Root' in trailer: self.set_root(dict_value(trailer['Root'])) break else: raise PDFSyntaxError('No /Root object! - Is this really a PDF?') # The document is set to be non-ready again, until all the # proper initialization (asking the password key and # verifying the access permission, so on) is finished. self._initialized = False return
def __init__(self, parser, password='', caching=True, fallback=True, dbg=False): if dbg: print 'PDFDocument() debugging enabled' debug = 3 "Set the document to use a given PDFParser object." self.caching = caching self.eof_distance = 0 self.found_eof = False self.xrefs = [] self.info = [] self.errors = [] self.catalog = None self.encryption = None self.decipher = None self._parser = None self._cached_objs = {} self._parsed_objs = {} self._parser = parser self._parser.set_document(self) self.is_printable = self.is_modifiable = self.is_extractable = True # Retrieve the information of each header that was appended # (maybe multiple times) at the end of the document. try: pos = self.find_xref(parser) self.read_xref_from(parser, pos, self.xrefs) except PDFNoValidXRef: fallback = True except PSEOF: self.errors.append( "<PDFMiner><PDFDocument><__init__>Error reading xref table: Possible malformed table</__init__></PDFDocument></PDFMiner>") if fallback: parser.fallback = True xref = PDFXRefFallback() xref.load(parser) self.xrefs.append(xref) for xref in self.xrefs: trailer = xref.get_trailer() if not trailer: continue # If there's an encryption info, remember it. if 'Encrypt' in trailer: # assert not self.encryption self.encryption = (list_value(trailer['ID']), dict_value(trailer['Encrypt'])) self._initialize_password(password) if 'Info' in trailer: self.info.append(dict_value(trailer['Info'])) if 'Root' in trailer: # Every PDF file must have exactly one /Root dictionary. self.catalog = dict_value(trailer['Root']) break # else: # raise PDFSyntaxError('No /Root object! - Is this really a PDF?') if self.catalog and self.catalog.get('Type') is not LITERAL_CATALOG: if STRICT: raise PDFSyntaxError('Catalog not found!') return
def do_Do(self, xobjid): xobjid = literal_name(xobjid) try: xobj = stream_value(self.xobjmap[xobjid]) except KeyError: if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return if 1 <= self.debug: print >> stderr, 'Processing xobj: %r' % xobj subtype = xobj.dic.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj.dic: interpreter = self.dup() (x0, y0, x1, y1) = list_value(xobj.dic['BBox']) ctm = mult_matrix( list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm) (x0, y0) = apply_matrix(ctm, (x0, y0)) (x1, y1) = apply_matrix(ctm, (x1, y1)) bbox = (x0, y0, x1, y1) self.device.begin_figure(xobjid, bbox) interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: (x0, y0) = apply_matrix(self.ctm, (0, 0)) (x1, y1) = apply_matrix(self.ctm, (1, 1)) self.device.begin_figure(xobjid, (x0, y0, x1, y1)) (w, h) = (xobj.dic['Width'], xobj.dic['Height']) self.device.render_image(xobj, (w, h), self.ctm) self.device.end_figure(xobjid) else: # unsupported xobject type. pass return
def do_Do(self, xobjid): xobjid = literal_name(xobjid) try: xobj = stream_value(self.xobjmap[xobjid]) except KeyError: if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return if 1 <= self.debug: print >>stderr, 'Processing xobj: %r' % xobj subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() bbox = list_value(xobj['BBox']) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) self.device.begin_figure(xobjid, bbox, matrix) interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm)) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) self.device.render_image(xobjid, xobj) self.device.end_figure(xobjid) else: # unsupported xobject type. pass return
def set_root(self, root): self.root = root self.catalog = dict_value(self.root) if self.catalog.get('Type') is not LITERAL_CATALOG: if STRICT: raise PDFSyntaxError('Catalog not found!') return
def __init__(self, doc, pageid, attrs): """Initialize a page object. doc: a PDFDocument object. pageid: any Python object that can uniquely identify the page. attrs: a dictionary of page attributes. """ self.doc = doc self.pageid = pageid self.attrs = dict_value(attrs) self.lastmod = resolve1(self.attrs.get('LastModified')) self.resources = resolve1(self.attrs['Resources']) self.mediabox = resolve1(self.attrs['MediaBox']) if 'CropBox' in self.attrs: self.cropbox = resolve1(self.attrs['CropBox']) else: self.cropbox = self.mediabox self.rotate = (self.attrs.get('Rotate', 0)+360) % 360 self.annots = self.attrs.get('Annots') self.beads = self.attrs.get('B') if 'Contents' in self.attrs: contents = resolve1(self.attrs['Contents']) else: contents = [] if not isinstance(contents, list): contents = [ contents ] self.contents = contents return
def do_Do(self, xobjid): xobjid = literal_name(xobjid) try: xobj = stream_value(self.xobjmap[xobjid]) except KeyError: if STRICT: raise PDFInterpreterError("Undefined xobject id: %r" % xobjid) return if 1 <= self.debug: print >> stderr, "Processing xobj: %r" % xobj subtype = xobj.dic.get("Subtype") if subtype is LITERAL_FORM and "BBox" in xobj.dic: interpreter = self.dup() (x0, y0, x1, y1) = list_value(xobj.dic["BBox"]) ctm = mult_matrix(list_value(xobj.dic.get("Matrix", MATRIX_IDENTITY)), self.ctm) (x0, y0) = apply_matrix(ctm, (x0, y0)) (x1, y1) = apply_matrix(ctm, (x1, y1)) bbox = (x0, y0, x1, y1) self.device.begin_figure(xobjid, bbox) interpreter.render_contents(dict_value(xobj.dic.get("Resources")), [xobj], ctm=ctm) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and "Width" in xobj.dic and "Height" in xobj.dic: (x0, y0) = apply_matrix(self.ctm, (0, 0)) (x1, y1) = apply_matrix(self.ctm, (1, 1)) self.device.begin_figure(xobjid, (x0, y0, x1, y1)) (w, h) = (xobj.dic["Width"], xobj.dic["Height"]) self.device.render_image(xobj, (w, h), self.ctm) self.device.end_figure(xobjid) else: # unsupported xobject type. pass return
def do_Do(self, xobjid): xobjid = literal_name(xobjid) try: xobj = stream_value(self.xobjmap[xobjid]) except KeyError: if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return if 1 <= self.debug: print >>sys.stderr, 'Processing xobj: %r' % xobj subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() bbox = list_value(xobj['BBox']) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) # According to PDF reference 1.7 section 4.9.1, XObjects in # earlier PDFs (prior to v1.2) use the page's Resources entry # instead of having their own Resources entry. resources = dict_value(xobj.get('Resources')) or self.resources.copy() self.device.begin_figure(xobjid, bbox, matrix) interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm)) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) self.device.render_image(xobjid, xobj) self.device.end_figure(xobjid) else: # unsupported xobject type. pass return
def __init__(self, doc, pageid, attrs): """Initialize a page object. doc: a PDFDocument object. pageid: any Python object that can uniquely identify the page. attrs: a dictionary of page attributes. """ self.doc = doc self.pageid = pageid self.attrs = dict_value(attrs) self.lastmod = resolve1(self.attrs.get("LastModified")) self.resources = resolve1(self.attrs["Resources"]) self.mediabox = resolve1(self.attrs["MediaBox"]) if "CropBox" in self.attrs: self.cropbox = resolve1(self.attrs["CropBox"]) else: self.cropbox = self.mediabox self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360 self.annots = self.attrs.get("Annots") self.beads = self.attrs.get("B") if "Contents" in self.attrs: contents = resolve1(self.attrs["Contents"]) else: contents = [] if not isinstance(contents, list): contents = [contents] self.contents = contents return
def do_keyword(self, pos, token): """Handles PDF-related keywords.""" if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): self.add_results(*self.pop(1)) elif token is self.KEYWORD_ENDOBJ: self.add_results(*self.pop(4)) elif token is self.KEYWORD_NULL: # null object self.push((pos, None)) elif token is self.KEYWORD_R: # reference to indirect object try: ((_, objid), (_, genno)) = self.pop(2) (objid, genno) = (int(objid), int(genno)) obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: pass elif token is self.KEYWORD_STREAM: # stream object ((_, dic),) = self.pop(1) dic = dict_value(dic) objlen = 0 if not self.fallback: try: objlen = int_value(dic['Length']) except KeyError: handle_error(PDFSyntaxError, '/Length is undefined: %r' % dic) self.seek(pos) try: (_, line) = self.nextline() # 'stream' except PSEOF: handle_error(PDFSyntaxError, 'Unexpected EOF') return pos += len(line) self.fp.seek(pos) data = self.fp.read(objlen) self.seek(pos + objlen) while 1: try: (linepos, line) = self.nextline() except PSEOF: handle_error(PDFSyntaxError, 'Unexpected EOF') break if 'endstream' in line: i = line.index('endstream') objlen += i data += line[:i] break objlen += len(line) data += line self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10]) obj = PDFStream(dic, data, self.doc.decipher) self.push((pos, obj)) else: # others self.push((pos, token))
def __init__(self, parser, caching=True, fallback=True): "Set the document to use a given PDFParser object." self.caching = caching self.xrefs = [] self.info = [] self.catalog = None self.encryption = None self.decipher = None self._parser = None self._cached_objs = {} self._parsed_objs = {} self._parser = parser self._parser.set_document(self) # Retrieve the information of each header that was appended # (maybe multiple times) at the end of the document. try: pos = self.find_xref(parser) self.read_xref_from(parser, pos, self.xrefs) except PDFNoValidXRef: fallback = True if fallback: parser.fallback = True xref = PDFXRefFallback() xref.load(parser) self.xrefs.append(xref) for xref in self.xrefs: trailer = xref.get_trailer() if not trailer: continue # If there's an encryption info, remember it. if 'Encrypt' in trailer: #assert not self.encryption self.encryption = (list_value(trailer['ID']), dict_value(trailer['Encrypt'])) if 'Info' in trailer: self.info.append(dict_value(trailer['Info'])) if 'Root' in trailer: # Every PDF file must have exactly one /Root dictionary. self.catalog = dict_value(trailer['Root']) break else: raise PDFSyntaxError('No /Root object! - Is this really a PDF?') if self.catalog.get('Type') is not LITERAL_CATALOG: if STRICT: raise PDFSyntaxError('Catalog not found!') return
def search(obj, parent): if isinstance(obj, int): objid = obj tree = dict_value(document.getobj(objid)).copy() else: objid = obj.objid tree = dict_value(obj).copy() for (k, v) in parent.iteritems(): if k in klass.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: logging.info('Pages: Kids=%r' % tree['Kids']) for c in list_value(tree['Kids']): for x in search(c, tree): yield x elif tree.get('Type') is LITERAL_PAGE: logging.info('Page: %r' % tree) yield (objid, tree)
def search(obj, parent): if isinstance(obj, int): objid = obj tree = dict_value(self.getobj(objid)).copy() else: objid = obj.objid tree = dict_value(obj).copy() for (k,v) in parent.iteritems(): if k in self.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: if 1 <= self.debug: print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids'] for c in list_value(tree['Kids']): for x in search(c, tree): yield x elif tree.get('Type') is LITERAL_PAGE: if 1 <= self.debug: print >>sys.stderr, 'Page: %r' % tree yield (objid, tree)
def load_trailer(self, parser): try: (_, kwd) = parser.nexttoken() assert kwd is self.KEYWORD_TRAILER (_, dic) = parser.nextobject() except PSEOF: x = parser.pop(1) if not x: raise PDFNoValidXRef('Unexpected EOF - file corrupted') (_, dic) = x[0] self.trailer.update(dict_value(dic))
def search(obj, parent): if isinstance(obj, int): objid = obj tree = dict_value(document.getobj(objid)).copy() else: objid = obj.objid tree = dict_value(obj).copy() for (k, v) in parent.iteritems(): if k in klass.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get("Type") is LITERAL_PAGES and "Kids" in tree: if 1 <= debug: print >>sys.stderr, "Pages: Kids=%r" % tree["Kids"] for c in list_value(tree["Kids"]): for x in search(c, tree): yield x elif tree.get("Type") is LITERAL_PAGE: if 1 <= debug: print >>sys.stderr, "Page: %r" % tree yield (objid, tree)
def load_trailer(self, parser): try: (_, kwd) = parser.nexttoken() assert kwd is self.KEYWORD_TRAILER (_, dic) = parser.nextobject() except PSEOF: x = parser.pop(1) if not x: raise PDFNoValidXRef('Unexpected EOF - file corrupted') (_, dic) = x[0] self.trailer.update(dict_value(dic)) return
class PDFCIDFont(PDFFont): def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: if STRICT: raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (self.cidsysteminfo.get( 'Registry', 'unknown'), self.cidsysteminfo.get('Ordering', 'unknown')) try: name = literal_name(spec['Encoding']) except KeyError: if STRICT: raise PDFFontError('Encoding is unspecified') name = 'unknown' try: self.cmap = CMapDB.get_cmap(name) except CMapDB.CMapNotFound, e: if STRICT: raise PDFFontError(e) self.cmap = CMap() try: descriptor = dict_value(spec['FontDescriptor']) except KeyError: if STRICT: raise PDFFontError('FontDescriptor is missing') descriptor = {} ttf = None if 'FontFile2' in descriptor: self.fontfile = stream_value(descriptor.get('FontFile2')) ttf = TrueTypeFont(self.basefont, StringIO(self.fontfile.get_data())) self.unicode_map = None if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) self.unicode_map = FileUnicodeMap() CMapParser(self.unicode_map, StringIO(strm.get_data())).run() elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'): if ttf: try: self.unicode_map = ttf.create_unicode_map() except TrueTypeFont.CMapNotFound: pass else: try: self.unicode_map = CMapDB.get_unicode_map( self.cidcoding, self.cmap.is_vertical()) except CMapDB.CMapNotFound, e: pass
def init_resources(self, resources): self.resources = resources self.fontmap = {} self.xobjmap = {} self.csmap = PREDEFINED_COLORSPACE.copy() if not resources: return def get_colorspace(spec): if isinstance(spec, list): name = literal_name(spec[0]) else: name = literal_name(spec) if name == "ICCBased" and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, stream_value(spec[1])["N"]) elif name == "DeviceN" and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE.get(name) for (k, v) in dict_value(resources).iteritems(): if 2 <= self.debug: print >>sys.stderr, "Resource: %r: %r" % (k, v) if k == "Font": for (fontid, spec) in dict_value(v).iteritems(): objid = None if isinstance(spec, PDFObjRef): objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == "ColorSpace": for (csid, spec) in dict_value(v).iteritems(): self.csmap[csid] = get_colorspace(resolve1(spec)) elif k == "ProcSet": self.rsrcmgr.get_procset(list_value(v)) elif k == "XObject": for (xobjid, xobjstrm) in dict_value(v).iteritems(): self.xobjmap[xobjid] = xobjstrm return
def get_dest(self, name): try: # PDF-1.2 or later obj = self.lookup_name('Dests', name) except KeyError: # PDF-1.1 or prior if 'Dests' not in self.catalog: raise PDFDestinationNotFound(name) d0 = dict_value(self.catalog['Dests']) if name not in d0: raise PDFDestinationNotFound(name) obj = d0[name] return obj
def lookup(d): if 'Limits' in d: (k1,k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None if 'Names' in d: objs = list_value(d['Names']) names = dict(choplist(2, objs)) return names[key] if 'Kids' in d: for c in list_value(d['Kids']): v = lookup(dict_value(c)) if v: return v raise KeyError((cat,key))
def lookup(d): if 'Limits' in d: (k1, k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None if 'Names' in d: objs = list_value(d['Names']) names = dict(choplist(2, objs)) return names[key] if 'Kids' in d: for c in list_value(d['Kids']): v = lookup(dict_value(c)) if v: return v raise KeyError((cat, key))
def __init__(self, rsrcmgr, spec): firstchar = int_value(spec.get("FirstChar", 0)) lastchar = int_value(spec.get("LastChar", 0)) widths = list_value(spec.get("Widths", [0] * 256)) widths = dict((i + firstchar, w) for (i, w) in enumerate(widths)) if "FontDescriptor" in spec: descriptor = dict_value(spec["FontDescriptor"]) else: descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]} PDFSimpleFont.__init__(self, descriptor, widths, spec) self.matrix = tuple(list_value(spec.get("FontMatrix"))) (_, self.descent, _, self.ascent) = self.bbox (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1)) return
def __init__(self, rsrcmgr, spec): firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 0)) widths = list_value(spec.get('Widths', [0] * 256)) widths = dict((i + firstchar, w) for (i, w) in enumerate(widths)) if 'FontDescriptor' in spec: descriptor = dict_value(spec['FontDescriptor']) else: descriptor = {'Ascent': 0, 'Descent': 0, 'FontBBox': spec['FontBBox']} PDFSimpleFont.__init__(self, descriptor, widths, spec) self.matrix = tuple(list_value(spec.get('FontMatrix'))) (_, self.descent, _, self.ascent) = self.bbox (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
def init_resources(self, resources): """Prepare the fonts and XObjects listed in the Resource attribute.""" self.resources = resources self.fontmap = {} self.xobjmap = {} self.csmap = PREDEFINED_COLORSPACE.copy() if not resources: return def get_colorspace(spec): if isinstance(spec, list): name = literal_name(spec[0]) else: name = literal_name(spec) if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, stream_value(spec[1])['N']) elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE.get(name) for (k, v) in dict_value(resources).iteritems(): log.debug('Resource: %r: %r', k, v) if k == 'Font': for (fontid, spec) in dict_value(v).iteritems(): objid = None if isinstance(spec, PDFObjRef): objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == 'ColorSpace': for (csid, spec) in dict_value(v).iteritems(): self.csmap[csid] = get_colorspace(resolve1(spec)) elif k == 'ProcSet': self.rsrcmgr.get_procset(list_value(v)) elif k == 'XObject': for (xobjid, xobjstrm) in dict_value(v).iteritems(): self.xobjmap[xobjid] = xobjstrm
def __init__(self, rsrcmgr, spec): firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 0)) widths = list_value(spec.get('Widths', [0]*256)) widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths)) if 'FontDescriptor' in spec: descriptor = dict_value(spec['FontDescriptor']) else: descriptor = {'Ascent':0, 'Descent':0, 'FontBBox':spec['FontBBox']} PDFSimpleFont.__init__(self, descriptor, widths, spec) self.matrix = tuple(list_value(spec.get('FontMatrix'))) (_,self.descent,_,self.ascent) = self.bbox (self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1)) return
def search(entry, level): entry = dict_value(entry) if 'Title' in entry: if 'A' in entry or 'Dest' in entry: title = decode_text(str_value(entry['Title'])) dest = entry.get('Dest') action = entry.get('A') se = entry.get('SE') yield (level, title, dest, action, se) if 'First' in entry and 'Last' in entry: for x in search(entry['First'], level + 1): yield x if 'Next' in entry: for x in search(entry['Next'], level): yield x return
def search(entry, level): entry = dict_value(entry) if 'Title' in entry: if 'A' in entry or 'Dest' in entry: title = decode_text(str_value(entry['Title'])) dest = entry.get('Dest') action = entry.get('A') se = entry.get('SE') yield (level, title, dest, action, se) if 'First' in entry and 'Last' in entry: for x in search(entry['First'], level+1): yield x if 'Next' in entry: for x in search(entry['Next'], level): yield x return
def __init__(self, rsrc, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: if STRICT: raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' try: (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) except KeyError: descriptor = dict_value(spec.get('FontDescriptor', {})) firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 255)) widths = list_value(spec.get('Widths', [0]*256)) widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) ) PDFSimpleFont.__init__(self, descriptor, widths, spec) return
def __init__(self, rsrc, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: if STRICT: raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' try: (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) except KeyError: descriptor = dict_value(spec.get('FontDescriptor', {})) firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 255)) widths = list_value(spec.get('Widths', [0] * 256)) widths = dict((i + firstchar, w) for (i, w) in enumerate(widths)) PDFSimpleFont.__init__(self, descriptor, widths, spec) return
def get_font(self, objid, spec): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: if 2 <= self.debug: print >> sys.stderr, 'get_font: create: objid=%r, spec=%r' % ( objid, spec) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') # Create a Font object. if 'Subtype' in spec: subtype = literal_name(spec['Subtype']) else: if STRICT: raise PDFFontError('Font Subtype is not specified.') subtype = 'Type1' if subtype in ('Type1', 'MMType1'): # Type1 Font font = PDFType1Font(self, spec) elif subtype == 'TrueType': # TrueType Font font = PDFTrueTypeFont(self, spec) elif subtype == 'Type3': # Type3 Font font = PDFType3Font(self, spec) elif subtype in ('CIDFontType0', 'CIDFontType2'): # CID Font font = PDFCIDFont(self, spec) elif subtype == 'Type0': # Type0 Font dfonts = list_value(spec['DescendantFonts']) assert dfonts subspec = dict_value(dfonts[0]).copy() for k in ('Encoding', 'ToUnicode'): if k in spec: subspec[k] = resolve1(spec[k]) font = self.get_font(None, subspec) else: if STRICT: raise PDFFontError('Invalid Font spec: %r' % spec) font = PDFType1Font(self, spec) # this is so wrong! if objid and self.caching: self._cached_fonts[objid] = font return font
def init_params(self): super(PDFStandardSecurityHandlerV4, self).init_params() self.length = 128 self.cf = dict_value(self.param.get('CF')) self.stmf = literal_name(self.param['StmF']) self.strf = literal_name(self.param['StrF']) self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True)) if self.stmf != self.strf: raise PDFEncryptionError('Unsupported crypt filter: param=%r' % self.param) self.cfm = {} for k, v in self.cf.items(): f = self.get_cfm(literal_name(v['CFM'])) if f is None: raise PDFEncryptionError('Unknown crypt filter method: param=%r' % self.param) self.cfm[k] = f self.cfm['Identity'] = self.decrypt_identity if self.strf not in self.cfm: raise PDFEncryptionError('Undefined crypt filter: param=%r' % self.param)
def get_font(self, objid, spec): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: if 2 <= self.debug: print >>sys.stderr, "get_font: create: objid=%r, spec=%r" % (objid, spec) if STRICT: if spec["Type"] is not LITERAL_FONT: raise PDFFontError("Type is not /Font") # Create a Font object. if "Subtype" in spec: subtype = literal_name(spec["Subtype"]) else: if STRICT: raise PDFFontError("Font Subtype is not specified.") subtype = "Type1" if subtype in ("Type1", "MMType1"): # Type1 Font font = PDFType1Font(self, spec) elif subtype == "TrueType": # TrueType Font font = PDFTrueTypeFont(self, spec) elif subtype == "Type3": # Type3 Font font = PDFType3Font(self, spec) elif subtype in ("CIDFontType0", "CIDFontType2"): # CID Font font = PDFCIDFont(self, spec) elif subtype == "Type0": # Type0 Font dfonts = list_value(spec["DescendantFonts"]) assert dfonts subspec = dict_value(dfonts[0]).copy() for k in ("Encoding", "ToUnicode"): if k in spec: subspec[k] = resolve1(spec[k]) font = self.get_font(None, subspec) else: if STRICT: raise PDFFontError("Invalid Font spec: %r" % spec) font = PDFType1Font(self, spec) # this is so wrong! if objid and self.caching: self._cached_fonts[objid] = font return font
def get_font(self, objid, spec): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: if 2 <= self.debug: print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') # Create a Font object. if 'Subtype' in spec: subtype = literal_name(spec['Subtype']) else: if STRICT: raise PDFFontError('Font Subtype is not specified.') subtype = 'Type1' if subtype in ('Type1', 'MMType1'): # Type1 Font font = PDFType1Font(self, spec) elif subtype == 'TrueType': # TrueType Font font = PDFTrueTypeFont(self, spec) elif subtype == 'Type3': # Type3 Font font = PDFType3Font(self, spec) elif subtype in ('CIDFontType0', 'CIDFontType2'): # CID Font font = PDFCIDFont(self, spec) elif subtype == 'Type0': # Type0 Font dfonts = list_value(spec['DescendantFonts']) assert dfonts subspec = dict_value(dfonts[0]).copy() for k in ('Encoding', 'ToUnicode'): if k in spec: subspec[k] = resolve1(spec[k]) font = self.get_font(None, subspec) else: if STRICT: raise PDFFontError('Invalid Font spec: %r' % spec) font = PDFType1Font(self, spec) # this is so wrong! if objid and self.caching: self._cached_fonts[objid] = font return font
def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: handle_error(PDFFontError, 'BaseFont is missing') self.basefont = 'unknown' self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), self.cidsysteminfo.get('Ordering', 'unknown')) try: name = literal_name(spec['Encoding']) except KeyError: handle_error(PDFFontError, 'Encoding is unspecified') name = 'unknown' try: self.cmap = CMapDB.get_cmap(name) except CMapDB.CMapNotFound, e: handle_error(PDFFontError, str(e)) self.cmap = CMap()
def __init__(self, rsrc, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: if STRICT: raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (self.cidsysteminfo.get( 'Registry', 'unknown'), self.cidsysteminfo.get('Ordering', 'unknown')) try: name = literal_name(spec['Encoding']) except KeyError: if STRICT: raise PDFFontError('Encoding is unspecified') name = 'unknown' try: self.cmap = rsrc.get_cmap(name, strict=STRICT) except CMapDB.CMapNotFound, e: raise PDFFontError(e)
def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: handle_error(PDFFontError, 'BaseFont is missing') self.basefont = 'unknown' try: (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) except KeyError: descriptor = dict_value(spec.get('FontDescriptor', {})) firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 255)) widths = list_value(spec.get('Widths', [0] * 256)) widths = dict((i+firstchar, w) for (i, w) in enumerate(widths)) PDFSimpleFont.__init__(self, descriptor, widths, spec) if 'Encoding' not in spec and 'FontFile' in descriptor: # try to recover the missing encoding info from the font file. self.fontfile = stream_value(descriptor.get('FontFile')) length1 = int_value(self.fontfile['Length1']) data = self.fontfile.get_data()[:length1] parser = Type1FontHeaderParser(StringIO(data)) self.cid2unicode = parser.get_encoding()
def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: if STRICT: raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' try: (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) except KeyError: descriptor = dict_value(spec.get('FontDescriptor', {})) firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 255)) widths = list_value(spec.get('Widths', [0]*256)) widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) ) PDFSimpleFont.__init__(self, descriptor, widths, spec) if 'Encoding' not in spec and 'FontFile' in descriptor: # try to recover the missing encoding info from the font file. self.fontfile = stream_value(descriptor.get('FontFile')) length1 = int_value(self.fontfile['Length1']) data = self.fontfile.get_data()[:length1] parser = Type1FontHeaderParser(StringIO(data)) self.cid2unicode = parser.get_encoding() return
def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: if STRICT: raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (self.cidsysteminfo.get( 'Registry', 'unknown'), self.cidsysteminfo.get('Ordering', 'unknown')) try: name = literal_name(spec['Encoding']) except KeyError: if STRICT: raise PDFFontError('Encoding is unspecified') name = 'unknown' try: self.cmap = CMapDB.get_cmap(name) except CMapDB.CMapNotFound as e: if STRICT: raise PDFFontError(e) self.cmap = CMap() try: descriptor = dict_value(spec['FontDescriptor']) except KeyError: if STRICT: raise PDFFontError('FontDescriptor is missing') descriptor = {} ttf = None if 'FontFile2' in descriptor: self.fontfile = stream_value(descriptor.get('FontFile2')) ttf = TrueTypeFont(self.basefont, StringIO(self.fontfile.get_data())) self.unicode_map = None if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) self.unicode_map = FileUnicodeMap() CMapParser(self.unicode_map, StringIO(strm.get_data())).run() elif self.cidcoding == 'Adobe-Identity': if ttf: try: self.unicode_map = ttf.create_unicode_map() except TrueTypeFont.CMapNotFound: pass else: try: self.unicode_map = CMapDB.get_unicode_map( self.cidcoding, self.cmap.is_vertical()) except CMapDB.CMapNotFound as e: pass self.vertical = self.cmap.is_vertical() if self.vertical: # writing mode: vertical widths = get_widths2(list_value(spec.get('W2', []))) self.disps = dict( (cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems()) (vy, w) = spec.get('DW2', [880, -1000]) self.default_disp = (None, vy) widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems()) default_width = w else: # writing mode: horizontal self.disps = {} self.default_disp = 0 widths = get_widths(list_value(spec.get('W', []))) default_width = spec.get('DW', 1000) PDFFont.__init__(self, descriptor, widths, default_width=default_width) return
def do_keyword(self, pos, token): """Handles PDF-related keywords.""" if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): self.add_results(*self.pop(1)) elif token is self.KEYWORD_ENDOBJ: self.add_results(*self.pop(4)) elif token is self.KEYWORD_NULL: # null object self.push((pos, None)) elif token is self.KEYWORD_R: # reference to indirect object try: ((_,objid), (_,genno)) = self.pop(2) (objid, genno) = (int(objid), int(genno)) obj = PDFObjRef(self.doc, objid, genno) self.push((pos, obj)) except PSSyntaxError: pass elif token is self.KEYWORD_STREAM: # stream object ((_,dic),) = self.pop(1) dic = dict_value(dic) objlen = 0 if not self.fallback: try: objlen = int_value(dic['Length']) except KeyError: if STRICT: raise PDFSyntaxError('/Length is undefined: %r' % dic) self.seek(pos) try: (_, line) = self.nextline() # 'stream' except PSEOF: if STRICT: raise PDFSyntaxError('Unexpected EOF') return pos += len(line) self.fp.seek(pos) data = self.fp.read(objlen) self.seek(pos+objlen) while 1: try: (linepos, line) = self.nextline() except PSEOF: if STRICT: raise PDFSyntaxError('Unexpected EOF') break if 'endstream' in line: i = line.index('endstream') objlen += i data += line[:i] break objlen += len(line) data += line self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary if 2 <= self.debug: print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ (pos, objlen, dic, data[:10]) obj = PDFStream(dic, data, self.doc.decipher) self.push((pos, obj)) else: # others self.push((pos, token)) return