示例#1
0
 def getobj(self, objid):
     """Return the object registered under ``objid``.

     A previously cached ``(object, genno)`` pair is reused when present.
     Otherwise each entry of ``self.xrefs`` is asked in turn for the
     object's position via ``get_pos(objid)``, which yields
     ``(strmid, index, genno)``.  When ``strmid`` is not None the object
     is read through ``self._getobj_objstm`` from the stream obtained by
     ``self.getobj(strmid)``; otherwise ``self._getobj_parse(index, objid)``
     is used.  A table that raises ``KeyError``, or whose position fails
     with ``PSEOF``/``PDFSyntaxError``, is skipped in favour of the next.

     If ``self.decipher`` is set, the result is passed through
     ``decipher_all`` on every call; the cache keeps the object as it was
     before that step.

     Raises:
         AssertionError: if ``objid`` is 0.
         PDFException: if ``self.xrefs`` is empty.
         PDFObjectNotFound: if no xref table produced the object.
     """
     assert objid != 0
     if not self.xrefs:
         raise PDFException('PDFDocument is not initialized')
     verbose = (2 <= self.debug)
     if verbose:
         print >> sys.stderr, 'getobj: objid=%r' % (objid)
     if objid not in self._cached_objs:
         located = False
         for table in self.xrefs:
             try:
                 entry = table.get_pos(objid)
             except KeyError:
                 # This table has no entry for the object; try the next.
                 continue
             (container_id, position, genno) = entry
             try:
                 if container_id is None:
                     obj = self._getobj_parse(position, objid)
                 else:
                     container = stream_value(self.getobj(container_id))
                     obj = self._getobj_objstm(container, position, objid)
                 if isinstance(obj, PDFStream):
                     obj.set_objid(objid, genno)
             except (PSEOF, PDFSyntaxError):
                 # The entry could not be read; fall back to the next table.
                 continue
             located = True
             break
         if not located:
             raise PDFObjectNotFound(objid)
         if verbose:
             print >> sys.stderr, 'register: objid=%r: %r' % (objid, obj)
         if self.caching:
             self._cached_objs[objid] = (obj, genno)
     else:
         (obj, genno) = self._cached_objs[objid]
     if self.decipher:
         obj = decipher_all(self.decipher, objid, genno, obj)
     return obj
示例#2
0
 def get_pages(self):
     """Yield a ``PDFPage`` for every page reachable from the catalog's /Pages.

     This is a generator, so nothing below (including the initialisation
     check) runs until it is iterated.  The tree under
     ``self.catalog['Pages']`` is walked depth-first.  For each node, any
     key listed in ``self.INHERITABLE_ATTRS`` that the parent has and the
     node lacks is copied down before the node is examined.  Nodes whose
     /Type is ``LITERAL_PAGES`` and that have /Kids are descended into;
     nodes whose /Type is ``LITERAL_PAGE`` are yielded.  If the catalog
     has no /Pages entry, nothing is yielded.

     A node whose id already appears among its own ancestors is skipped,
     so a malformed file with circular /Kids references terminates
     instead of recursing until the interpreter's recursion limit.

     Raises:
         PDFException: if ``self.xrefs`` is empty.
     """
     if not self.xrefs:
         raise PDFException('PDFDocument is not initialized')
     def search(obj, parent, ancestors):
         # A node arrives either as a bare object id or as something that
         # carries its id in an ``objid`` attribute.
         by_id = isinstance(obj, int)
         objid = obj if by_id else obj.objid
         if objid in ancestors:
             # Circular reference back into the current path: stop here.
             return
         tree = dict_value(self.getobj(objid) if by_id else obj).copy()
         # items() rather than iteritems(): same result, and it does not
         # tie this loop to Python 2.
         for (k,v) in parent.items():
             if k in self.INHERITABLE_ATTRS and k not in tree:
                 tree[k] = v
         if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
             if 1 <= self.debug:
                 print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
             lineage = ancestors + (objid,)
             for c in list_value(tree['Kids']):
                 for x in search(c, tree, lineage):
                     yield x
         elif tree.get('Type') is LITERAL_PAGE:
             if 1 <= self.debug:
                 print >>sys.stderr, 'Page: %r' % tree
             yield (objid, tree)
     if 'Pages' not in self.catalog: return
     for (pageid,tree) in search(self.catalog['Pages'], self.catalog, ()):
         yield PDFPage(self, pageid, tree)
     return
示例#3
0
    def get_outlines(self):
        """Return a generator over the document outline (bookmark) entries.

        Each produced item is ``(level, title, dest, action, se)``.  An
        entry is reported only if it has /Title and at least one of /A or
        /Dest; ``dest``, ``action`` and ``se`` are the raw /Dest, /A and
        /SE values (None when absent).  The walk starts at
        ``self.catalog['Outlines']`` with level 0; children reached through
        /First (when /Last is also present) are one level deeper, and
        siblings reached through /Next share the level.

        Siblings are followed with a loop rather than by recursing on
        /Next, so a long run of bookmarks at one level does not exhaust
        the interpreter's recursion limit; only nesting depth uses stack.
        An entry carrying an ``objid`` that is already on the active path
        is not entered again, so circular /First or /Next links terminate.

        Raises:
            PDFException: immediately (not on first iteration) if the
                catalog has no /Outlines entry.
        """
        if 'Outlines' not in self.catalog:
            raise PDFException('No /Outlines defined!')

        # Ids of the referenced entries currently being walked, shared by
        # every nesting level of this traversal.
        active = set()

        def search(entry, level):
            entered = []
            try:
                while True:
                    # Entries without an ``objid`` attribute are not tracked;
                    # they are simply walked.
                    ref = getattr(entry, 'objid', None)
                    if ref is not None:
                        if ref in active:
                            # Link back onto the path being walked: stop.
                            return
                        active.add(ref)
                        entered.append(ref)
                    entry = dict_value(entry)
                    if 'Title' in entry:
                        if 'A' in entry or 'Dest' in entry:
                            title = decode_text(str_value(entry['Title']))
                            dest = entry.get('Dest')
                            action = entry.get('A')
                            se = entry.get('SE')
                            yield (level, title, dest, action, se)
                    if 'First' in entry and 'Last' in entry:
                        for x in search(entry['First'], level + 1):
                            yield x
                    if 'Next' not in entry:
                        return
                    entry = entry['Next']
            finally:
                # This sibling chain is finished; its entries are no longer
                # on the active path.
                active.difference_update(entered)

        return search(self.catalog['Outlines'], 0)
示例#4
0
 def getobj(self, objid):
     """Return the object stored under ``objid``.

     A cached object is reused when present.  Otherwise the first entry
     of ``self.xrefs`` whose ``get_pos(objid)`` does not raise
     ``KeyError`` supplies ``(strmid, index)``.  With a truthy ``strmid``
     the object is taken from the objects parsed out of the stream
     returned by ``self.getobj(strmid)``; otherwise it is parsed from
     ``self._parser`` after seeking to ``index``.  If ``self.decipher``
     is set, the result is passed through ``decipher_all`` before being
     returned; the cache keeps the object as it was before that step.

     Returns None when, with ``STRICT`` false, no xref knows the object
     or the index inside the object stream is out of range, and also
     when the parser hits ``PSEOF`` while reading the object itself.

     Raises:
         PDFException: if ``self.xrefs`` is empty.
         PDFSyntaxError: for an unknown object, a bad object stream or an
             out-of-range stream index when ``STRICT`` is true, and
             whenever a header with the expected id is not followed by
             the ``obj`` keyword.
     """
     if not self.xrefs:
         raise PDFException('PDFDocument is not initialized')
     if 2 <= self.debug:
         print >>sys.stderr, 'getobj: objid=%r' % (objid)
     if objid in self._cached_objs:
         # NOTE(review): only the object is cached, so a cached hit always
         # hands generation 0 to decipher_all, even if the object was first
         # parsed with a different generation number.  Left unchanged
         # because the cache layout may be read elsewhere - confirm.
         genno = 0
         obj = self._cached_objs[objid]
     else:
         for xref in self.xrefs:
             try:
                 (strmid, index) = xref.get_pos(objid)
                 break
             except KeyError:
                 pass
         else:
             if STRICT:
                 raise PDFSyntaxError('Cannot locate objid=%r' % objid)
             # return null for a nonexistent reference.
             return None
         if strmid:
             stream = stream_value(self.getobj(strmid))
             if stream.get('Type') is not LITERAL_OBJSTM:
                 if STRICT:
                     raise PDFSyntaxError('Not a stream object: %r' % stream)
             try:
                 n = stream['N']
             except KeyError:
                 if STRICT:
                     raise PDFSyntaxError('N is not defined: %r' % stream)
                 n = 0
             if strmid in self._parsed_objs:
                 objs = self._parsed_objs[strmid]
             else:
                 # Parse every object in the stream once; the list is kept
                 # per stream id when caching is enabled.
                 parser = PDFStreamParser(stream.get_data())
                 parser.set_document(self)
                 objs = []
                 try:
                     while True:
                         (_,obj) = parser.nextobject()
                         objs.append(obj)
                 except PSEOF:
                     pass
                 if self.caching:
                     self._parsed_objs[strmid] = objs
             genno = 0
             # presumably the first n*2 parsed items are the stream's
             # (objid, offset) header pairs - TODO confirm
             i = n*2+index
             try:
                 obj = objs[i]
             except IndexError:
                 if STRICT:
                     raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
                 # return None for an invalid object number
                 return None
             if isinstance(obj, PDFStream):
                 obj.set_objid(objid, 0)
         else:
             self._parser.seek(index)
             (_,objid1) = self._parser.nexttoken() # objid
             (_,genno) = self._parser.nexttoken() # genno
             (_,kwd) = self._parser.nexttoken()
             # #### hack around malformed pdf files
             #assert objid1 == objid, (objid, objid1)
             if objid1 != objid:
                 # Skip ahead to the next 'obj' keyword while remembering
                 # the last three tokens.  The keyword itself is always the
                 # newest one, so the generation number is the token just
                 # before it.
                 recent = [objid1, genno, kwd]
                 while kwd is not self.KEYWORD_OBJ:
                     (_,kwd) = self._parser.nexttoken()
                     recent.append(kwd)
                     del recent[0]
                 genno = recent[1]
             # #### end hack around malformed pdf files
             if kwd is not self.KEYWORD_OBJ:
                 raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
             try:
                 (_,obj) = self._parser.nextobject()
                 if isinstance(obj, PDFStream):
                     obj.set_objid(objid, genno)
             except PSEOF:
                 return None
         if 2 <= self.debug:
             print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
         if self.caching:
             self._cached_objs[objid] = obj
     if self.decipher:
         obj = decipher_all(self.decipher, objid, genno, obj)
     return obj