Пример #1
0
 def set_parser(self, parser):
     """Bind a parser to the document and read its trailer information.

     Does nothing when a parser is already attached.  Otherwise the
     cross-reference sections returned by ``parser.read_xref()`` are
     stored in ``self.xrefs`` and their trailers are scanned in order:
     an /Encrypt entry updates ``self.encryption``, an /Info entry is
     appended to ``self.info``, and the first /Root entry becomes
     ``self.catalog`` and ends the scan.

     Raises PDFSyntaxError when no trailer carries a /Root entry, or,
     when STRICT is set, when the catalog's /Type is not LITERAL_CATALOG.
     """
     if self._parser:
         return
     self._parser = parser
     # Trailers may have been appended to the file several times, so
     # every cross-reference section is inspected in turn.
     self.xrefs = parser.read_xref()
     root_found = False
     for section in self.xrefs:
         trailer_dict = section.get_trailer()
         if trailer_dict:
             if 'Encrypt' in trailer_dict:
                 # Remember the encryption parameters as (ID, Encrypt).
                 file_id = list_value(trailer_dict['ID'])
                 encrypt_dict = dict_value(trailer_dict['Encrypt'])
                 self.encryption = (file_id, encrypt_dict)
             if 'Info' in trailer_dict:
                 self.info.append(dict_value(trailer_dict['Info']))
             if 'Root' in trailer_dict:
                 # Every PDF file must have exactly one /Root dictionary.
                 self.catalog = dict_value(trailer_dict['Root'])
                 root_found = True
                 break
     if not root_found:
         raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
     if self.catalog.get('Type') is not LITERAL_CATALOG and STRICT:
         raise PDFSyntaxError('Catalog not found!')
     return
Пример #2
0
 def init_resources(self, resources):
     """Rebuild the font, XObject and colour-space tables.

     The three tables are always reset first: ``self.fontmap`` and
     ``self.xobjmap`` become empty and ``self.csmap`` becomes a copy of
     PREDEFINED_COLORSPACE.  If ``resources`` is empty nothing else
     happens; otherwise its 'Font', 'ColorSpace', 'ProcSet' and
     'XObject' entries are processed and any other key is ignored.
     """
     self.fontmap = {}
     self.xobjmap = {}
     self.csmap = PREDEFINED_COLORSPACE.copy()
     if not resources:
         return

     def get_colorspace(spec):
         # A spec is either a bare name or a list whose first item is
         # the name, optionally followed by parameters.
         is_array = isinstance(spec, list)
         if is_array:
             name = literal_name(spec[0])
         else:
             name = literal_name(spec)
         if name == 'ICCBased' and is_array and 2 <= len(spec):
             # Component count comes from the stream's 'N' entry.
             return PDFColorSpace(name, stream_value(spec[1])['N'])
         if name == 'DeviceN' and is_array and 2 <= len(spec):
             # Component count is the length of the second item.
             return PDFColorSpace(name, len(list_value(spec[1])))
         return PREDEFINED_COLORSPACE[name]

     for (key, value) in dict_value(resources).iteritems():
         if 1 <= self.debug:
             print >>stderr, 'Resource: %r: %r' % (key, value)
         if key == 'Font':
             for (fontid, spec) in dict_value(value).iteritems():
                 # Pass the object id along only when the font is
                 # given as an indirect reference.
                 if isinstance(spec, PDFObjRef):
                     objid = spec.objid
                 else:
                     objid = None
                 font_dict = dict_value(spec)
                 self.fontmap[fontid] = self.rsrc.get_font(objid, font_dict)
         elif key == 'ColorSpace':
             for (csid, spec) in dict_value(value).iteritems():
                 self.csmap[csid] = get_colorspace(resolve1(spec))
         elif key == 'ProcSet':
             self.rsrc.get_procset(list_value(value))
         elif key == 'XObject':
             # XObjects are stored as found and looked up later by name.
             for (xobjid, xobjstrm) in dict_value(value).iteritems():
                 self.xobjmap[xobjid] = xobjstrm
     return
Пример #3
0
    def lookup_name(self, cat, key):
        """Look up ``key`` in the name tree ``cat`` of the catalog.

        The tree is taken from ``self.catalog['Names'][cat]``.  Nodes
        whose 'Limits' range excludes the key yield None; a node with a
        'Names' list is searched directly; a node with 'Kids' is
        searched child by child and the first truthy result is returned.

        Raises KeyError when the catalog has no usable 'Names' entry,
        when ``cat`` is missing, or when the key cannot be found.
        """
        try:
            name_trees = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat, key))
        # may raise KeyError
        root_node = dict_value(name_trees[cat])

        def descend(node):
            if 'Limits' in node:
                (lowest, highest) = list_value(node['Limits'])
                out_of_range = key < lowest or highest < key
                if out_of_range:
                    return None
            if 'Names' in node:
                # The list alternates key, value, key, value, ...
                pairs = choplist(2, list_value(node['Names']))
                return dict(pairs)[key]
            if 'Kids' in node:
                for child in list_value(node['Kids']):
                    found = descend(dict_value(child))
                    if found:
                        return found
            raise KeyError((cat, key))

        return descend(root_node)
Пример #4
0
 def set_parser(self, parser):
     """Set the document to use a given PDFParser object.

     Does nothing when ``self.parser`` is already set.  Otherwise the
     parser is stored, the cross-reference sections returned by
     ``parser.read_xref()`` are saved in ``self.xrefs`` and their
     trailers are scanned in order: an /Encrypt entry updates
     ``self.encryption`` and the first /Root entry is passed to
     ``self.set_root()`` and ends the scan.

     Raises PDFSyntaxError when no trailer carries a /Root entry.
     Whatever happens, ``self._initialized`` is False on exit.
     """
     if self.parser:
         return
     self.parser = parser
     # The document is set to be temporarily ready during collecting
     # all the basic information about the document, e.g.
     # the header, the encryption information, and the access rights
     # for the document.
     self._initialized = True
     try:
         # Retrieve the information of each header that was appended
         # (maybe multiple times) at the end of the document.
         self.xrefs = parser.read_xref()
         for xref in self.xrefs:
             trailer = xref.get_trailer()
             if not trailer:
                 continue
             # If there's an encryption info, remember it.
             if 'Encrypt' in trailer:
                 self.encryption = (list_value(trailer['ID']),
                                    dict_value(trailer['Encrypt']))
             if 'Root' in trailer:
                 self.set_root(dict_value(trailer['Root']))
                 break
         else:
             raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
     finally:
         # The document is set to be non-ready again, until all the
         # proper initialization (asking the password key and
         # verifying the access permission, so on) is finished.
         # Doing this in ``finally`` keeps a failed scan from leaving
         # the document flagged as ready.
         self._initialized = False
     return
Пример #5
0
 def init_resources(self, resources):
     """Reset the resource tables and fill them from ``resources``.

     ``self.fontmap`` and ``self.xobjmap`` start out empty and
     ``self.csmap`` starts as a copy of PREDEFINED_COLORSPACE.  When
     ``resources`` is non-empty, its 'Font', 'ColorSpace', 'ProcSet'
     and 'XObject' entries are read; other keys are skipped.
     """
     self.fontmap = {}
     self.xobjmap = {}
     self.csmap = PREDEFINED_COLORSPACE.copy()
     if not resources:
         return

     def get_colorspace(spec):
         # The colour-space name is the spec itself or, for a list
         # spec, its first element.
         listed = isinstance(spec, list)
         if listed:
             name = literal_name(spec[0])
         else:
             name = literal_name(spec)
         if name == 'ICCBased' and listed and 2 <= len(spec):
             ncomponents = stream_value(spec[1])['N']
             return PDFColorSpace(name, ncomponents)
         if name == 'DeviceN' and listed and 2 <= len(spec):
             ncomponents = len(list_value(spec[1]))
             return PDFColorSpace(name, ncomponents)
         # Anything else must be one of the predefined spaces.
         return PREDEFINED_COLORSPACE[name]

     for (category, entries) in dict_value(resources).iteritems():
         if 1 <= self.debug:
             print >>stderr, 'Resource: %r: %r' % (category, entries)
         if category == 'Font':
             for (fontid, spec) in dict_value(entries).iteritems():
                 objid = None
                 if isinstance(spec, PDFObjRef):
                     # Keep the id of an indirectly referenced font.
                     objid = spec.objid
                 font_spec = dict_value(spec)
                 self.fontmap[fontid] = self.rsrc.get_font(objid, font_spec)
         elif category == 'ColorSpace':
             for (csid, spec) in dict_value(entries).iteritems():
                 self.csmap[csid] = get_colorspace(resolve1(spec))
         elif category == 'ProcSet':
             self.rsrc.get_procset(list_value(entries))
         elif category == 'XObject':
             for (xobjid, xobjstrm) in dict_value(entries).iteritems():
                 self.xobjmap[xobjid] = xobjstrm
     return
Пример #6
0
 def set_parser(self, parser):
     """Set the document to use a given PDFParser object.

     A no-op when a parser is already attached.  Otherwise stores the
     parser, reads the cross-reference sections into ``self.xrefs``
     and walks their trailers: /Encrypt is remembered in
     ``self.encryption`` and the first /Root is handed to
     ``self.set_root()``, which stops the walk.

     Raises PDFSyntaxError if none of the trailers has a /Root entry.
     ``self._initialized`` is reset to False on every exit path,
     including when an exception is raised.
     """
     if self.parser:
         return
     self.parser = parser
     # The document is set to be temporarily ready during collecting
     # all the basic information about the document, e.g.
     # the header, the encryption information, and the access rights
     # for the document.
     self._initialized = True
     try:
         # Retrieve the information of each header that was appended
         # (maybe multiple times) at the end of the document.
         self.xrefs = parser.read_xref()
         for xref in self.xrefs:
             trailer = xref.get_trailer()
             if not trailer:
                 continue
             # If there's an encryption info, remember it.
             if 'Encrypt' in trailer:
                 self.encryption = (list_value(trailer['ID']),
                                    dict_value(trailer['Encrypt']))
             if 'Root' in trailer:
                 self.set_root(dict_value(trailer['Root']))
                 break
         else:
             raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
     finally:
         # The document is set to be non-ready again, until all the
         # proper initialization (asking the password key and
         # verifying the access permission, so on) is finished.
         # ``finally`` guarantees this even if the scan above fails.
         self._initialized = False
     return
Пример #7
0
 def __init__(self, parser, password='', caching=True, fallback=True, dbg=False):
     """Set the document to use a given PDFParser object.

     parser: the PDFParser to read from; it is linked back to this
         document through ``parser.set_document(self)``.
     password: passed to ``self._initialize_password()`` when a trailer
         carries an /Encrypt entry.
     caching: stored as ``self.caching``.
     fallback: when true, or when reading the xref raises
         PDFNoValidXRef, a PDFXRefFallback loaded from the parser is
         appended to ``self.xrefs``.
     dbg: when true, a notice is printed on standard output.

     A PSEOF while reading the xref table is not fatal; a message is
     appended to ``self.errors`` instead.  A missing /Root entry is
     tolerated as well, in which case ``self.catalog`` stays None.

     Raises PDFSyntaxError if STRICT is set and the catalog that was
     found does not have /Type LITERAL_CATALOG.
     """
     if dbg:
         # Parenthesised so the line parses under Python 2 and 3 alike;
         # the printed text is unchanged.
         print('PDFDocument() debugging enabled')
     self.caching = caching
     self.eof_distance = 0
     self.found_eof = False
     self.xrefs = []
     self.info = []
     self.errors = []
     self.catalog = None
     self.encryption = None
     self.decipher = None
     self._cached_objs = {}
     self._parsed_objs = {}
     self._parser = parser
     self._parser.set_document(self)
     self.is_printable = self.is_modifiable = self.is_extractable = True
     # Retrieve the information of each header that was appended
     # (maybe multiple times) at the end of the document.
     try:
         pos = self.find_xref(parser)
         self.read_xref_from(parser, pos, self.xrefs)
     except PDFNoValidXRef:
         fallback = True
     except PSEOF:
         self.errors.append(
             "<PDFMiner><PDFDocument><__init__>Error reading xref table: Possible malformed table</__init__></PDFDocument></PDFMiner>")
     if fallback:
         parser.fallback = True
         xref = PDFXRefFallback()
         xref.load(parser)
         self.xrefs.append(xref)
     for xref in self.xrefs:
         trailer = xref.get_trailer()
         if not trailer:
             continue
         # If there's an encryption info, remember it.
         if 'Encrypt' in trailer:
             self.encryption = (list_value(trailer['ID']),
                                dict_value(trailer['Encrypt']))
             self._initialize_password(password)
         if 'Info' in trailer:
             self.info.append(dict_value(trailer['Info']))
         if 'Root' in trailer:
             # Every PDF file must have exactly one /Root dictionary.
             self.catalog = dict_value(trailer['Root'])
             break
     # Unlike a strict reader, no error is raised when the loop ends
     # without finding /Root; the catalog check below is simply skipped.
     if self.catalog and self.catalog.get('Type') is not LITERAL_CATALOG:
         if STRICT:
             raise PDFSyntaxError('Catalog not found!')
     return
Пример #8
0
 def do_Do(self, xobjid):
     """Draw the external object registered under ``xobjid``.

     The name is looked up in ``self.xobjmap``.  An unknown name raises
     PDFInterpreterError when STRICT is set and is otherwise ignored.
     A form XObject with a 'BBox' is rendered by a duplicate of this
     interpreter between begin_figure/end_figure calls on the device;
     an image XObject with 'Width' and 'Height' is handed to the
     device's render_image.  Any other object is skipped.
     """
     xobjid = literal_name(xobjid)
     try:
         xobj = stream_value(self.xobjmap[xobjid])
     except KeyError:
         if STRICT:
             raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
         return
     if 1 <= self.debug:
         print >> stderr, 'Processing xobj: %r' % xobj
     attrs = xobj.dic
     subtype = attrs.get('Subtype')
     if subtype is LITERAL_FORM and 'BBox' in attrs:
         child = self.dup()
         (ax, ay, bx, by) = list_value(attrs['BBox'])
         form_matrix = list_value(attrs.get('Matrix', MATRIX_IDENTITY))
         ctm = mult_matrix(form_matrix, self.ctm)
         # Map both corners of the bounding box through the combined matrix.
         (ax, ay) = apply_matrix(ctm, (ax, ay))
         (bx, by) = apply_matrix(ctm, (bx, by))
         self.device.begin_figure(xobjid, (ax, ay, bx, by))
         child.render_contents(
             dict_value(attrs.get('Resources')), [xobj], ctm=ctm)
         self.device.end_figure(xobjid)
     elif subtype is LITERAL_IMAGE and 'Width' in attrs and 'Height' in attrs:
         # The figure box is the unit square mapped through the current matrix.
         (ax, ay) = apply_matrix(self.ctm, (0, 0))
         (bx, by) = apply_matrix(self.ctm, (1, 1))
         self.device.begin_figure(xobjid, (ax, ay, bx, by))
         size = (attrs['Width'], attrs['Height'])
         self.device.render_image(xobj, size, self.ctm)
         self.device.end_figure(xobjid)
     # Any other xobject type is unsupported and silently skipped.
     return
Пример #9
0
 def do_Do(self, xobjid):
     """Draw the external object registered under ``xobjid``.

     The name is resolved through ``self.xobjmap``; an unknown name
     raises PDFInterpreterError under STRICT and is ignored otherwise.
     Form XObjects that carry a 'BBox' are rendered by a duplicated
     interpreter, image XObjects with 'Width' and 'Height' go to the
     device's render_image, and everything else is skipped.
     """
     xobjid = literal_name(xobjid)
     try:
         xobj = stream_value(self.xobjmap[xobjid])
     except KeyError:
         if STRICT:
             raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
         return
     if 1 <= self.debug:
         print >>stderr, 'Processing xobj: %r' % xobj
     kind = xobj.get('Subtype')
     if kind is LITERAL_FORM and 'BBox' in xobj:
         child = self.dup()
         form_bbox = list_value(xobj['BBox'])
         form_matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
         self.device.begin_figure(xobjid, form_bbox, form_matrix)
         form_resources = dict_value(xobj.get('Resources'))
         # The form's matrix is combined with the current matrix.
         form_ctm = mult_matrix(form_matrix, self.ctm)
         child.render_contents(form_resources, [xobj], ctm=form_ctm)
         self.device.end_figure(xobjid)
     elif kind is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
         # Images are wrapped in a unit-square figure.
         self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
         self.device.render_image(xobjid, xobj)
         self.device.end_figure(xobjid)
     # Any other xobject type is unsupported and silently skipped.
     return
Пример #10
0
 def set_root(self, root):
     """Store the root object and expose it as the document catalog.

     ``self.root`` keeps the object as given and ``self.catalog`` holds
     its dictionary value.  Raises PDFSyntaxError when STRICT is set
     and the catalog's /Type is not LITERAL_CATALOG.
     """
     self.root = root
     self.catalog = dict_value(self.root)
     catalog_type = self.catalog.get('Type')
     if catalog_type is not LITERAL_CATALOG and STRICT:
         raise PDFSyntaxError('Catalog not found!')
     return
Пример #11
0
 def __init__(self, doc, pageid, attrs):
     """Initialize a page object.

     doc: a PDFDocument object.
     pageid: any Python object that can uniquely identify the page.
     attrs: a dictionary of page attributes.

     'Resources' and 'MediaBox' must be present in ``attrs`` (a missing
     key raises KeyError).  'CropBox' falls back to the media box,
     'Rotate' defaults to 0 and is normalised modulo 360, and
     'Contents' is always stored as a list (empty when absent).
     """
     self.doc = doc
     self.pageid = pageid
     self.attrs = dict_value(attrs)
     self.lastmod = resolve1(self.attrs.get('LastModified'))
     self.resources = resolve1(self.attrs['Resources'])
     self.mediabox = resolve1(self.attrs['MediaBox'])
     if 'CropBox' in self.attrs:
         self.cropbox = resolve1(self.attrs['CropBox'])
     else:
         self.cropbox = self.mediabox
     # Resolve /Rotate like the other attributes before doing
     # arithmetic on it: the value may be stored as an indirect
     # reference rather than a plain number.
     rotation = resolve1(self.attrs.get('Rotate', 0))
     self.rotate = (rotation + 360) % 360
     # Annotations and beads are kept as found (not resolved here).
     self.annots = self.attrs.get('Annots')
     self.beads = self.attrs.get('B')
     if 'Contents' in self.attrs:
         contents = resolve1(self.attrs['Contents'])
     else:
         contents = []
     # A single content stream is wrapped so callers always get a list.
     if not isinstance(contents, list):
         contents = [contents]
     self.contents = contents
     return
Пример #12
0
 def do_Do(self, xobjid):
     """Draw the external object registered under ``xobjid``.

     Looks the name up in ``self.xobjmap``; a missing name raises
     PDFInterpreterError when STRICT is set and is otherwise ignored.
     A form XObject with a "BBox" is rendered by a duplicate
     interpreter inside a device figure; an image XObject with "Width"
     and "Height" is passed to the device's render_image.  Other
     objects are skipped.
     """
     xobjid = literal_name(xobjid)
     try:
         xobj = stream_value(self.xobjmap[xobjid])
     except KeyError:
         if STRICT:
             raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
         return
     if 1 <= self.debug:
         print >> stderr, "Processing xobj: %r" % xobj
     props = xobj.dic
     subtype = props.get("Subtype")
     if subtype is LITERAL_FORM and "BBox" in props:
         sub_interpreter = self.dup()
         (px, py, qx, qy) = list_value(props["BBox"])
         local_matrix = list_value(props.get("Matrix", MATRIX_IDENTITY))
         ctm = mult_matrix(local_matrix, self.ctm)
         # Transform the two bounding-box corners with the combined matrix.
         (px, py) = apply_matrix(ctm, (px, py))
         (qx, qy) = apply_matrix(ctm, (qx, qy))
         figure_box = (px, py, qx, qy)
         self.device.begin_figure(xobjid, figure_box)
         sub_interpreter.render_contents(
             dict_value(props.get("Resources")), [xobj], ctm=ctm)
         self.device.end_figure(xobjid)
     elif subtype is LITERAL_IMAGE and "Width" in props and "Height" in props:
         # The image occupies the unit square under the current matrix.
         (px, py) = apply_matrix(self.ctm, (0, 0))
         (qx, qy) = apply_matrix(self.ctm, (1, 1))
         self.device.begin_figure(xobjid, (px, py, qx, qy))
         dimensions = (props["Width"], props["Height"])
         self.device.render_image(xobj, dimensions, self.ctm)
         self.device.end_figure(xobjid)
     # Any other xobject type is unsupported and silently skipped.
     return
 def do_Do(self, xobjid):
     """Draw the external object registered under ``xobjid``.

     The name is resolved through ``self.xobjmap``; an unknown name
     raises PDFInterpreterError under STRICT and is ignored otherwise.
     A form XObject with a 'BBox' is rendered by a duplicated
     interpreter, using the form's own 'Resources' or, when that is
     empty, a copy of ``self.resources``.  An image XObject with
     'Width' and 'Height' goes to the device's render_image.  Other
     objects are skipped.
     """
     xobjid = literal_name(xobjid)
     try:
         xobj = stream_value(self.xobjmap[xobjid])
     except KeyError:
         if STRICT:
             raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
         return
     if 1 <= self.debug:
         print >>sys.stderr, 'Processing xobj: %r' % xobj
     kind = xobj.get('Subtype')
     if kind is LITERAL_FORM and 'BBox' in xobj:
         child = self.dup()
         form_bbox = list_value(xobj['BBox'])
         form_matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
         # According to PDF reference 1.7 section 4.9.1, XObjects in
         # earlier PDFs (prior to v1.2) use the page's Resources entry
         # instead of having their own Resources entry.
         own_resources = dict_value(xobj.get('Resources'))
         if own_resources:
             resources = own_resources
         else:
             resources = self.resources.copy()
         self.device.begin_figure(xobjid, form_bbox, form_matrix)
         form_ctm = mult_matrix(form_matrix, self.ctm)
         child.render_contents(resources, [xobj], ctm=form_ctm)
         self.device.end_figure(xobjid)
     elif kind is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
         # Images are wrapped in a unit-square figure.
         self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
         self.device.render_image(xobjid, xobj)
         self.device.end_figure(xobjid)
     # Any other xobject type is unsupported and silently skipped.
     return
Пример #14
0
 def do_Do(self, xobjid):
     """Draw the external object registered under ``xobjid``.

     An unknown name raises PDFInterpreterError when STRICT is set and
     is ignored otherwise.  Form XObjects with a 'BBox' are rendered
     through a duplicate of this interpreter; image XObjects with
     'Width' and 'Height' are passed to the device's render_image;
     anything else is skipped.
     """
     xobjid = literal_name(xobjid)
     try:
         xobj = stream_value(self.xobjmap[xobjid])
     except KeyError:
         if STRICT:
             raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
         return
     if 1 <= self.debug:
         print >>stderr, 'Processing xobj: %r' % xobj
     xobj_type = xobj.get('Subtype')
     if xobj_type is LITERAL_FORM and 'BBox' in xobj:
         nested = self.dup()
         box = list_value(xobj['BBox'])
         local_matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
         self.device.begin_figure(xobjid, box, local_matrix)
         nested_resources = dict_value(xobj.get('Resources'))
         # Concatenate the form's matrix with the current one.
         nested_ctm = mult_matrix(local_matrix, self.ctm)
         nested.render_contents(nested_resources, [xobj], ctm=nested_ctm)
         self.device.end_figure(xobjid)
     elif xobj_type is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
         # The image is reported inside a unit-square figure.
         self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
         self.device.render_image(xobjid, xobj)
         self.device.end_figure(xobjid)
     # Any other xobject type is unsupported and silently skipped.
     return
Пример #15
0
    def __init__(self, doc, pageid, attrs):
        """Build a page object from its attribute dictionary.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        "Resources" and "MediaBox" are required keys.  "CropBox"
        defaults to the media box, "Rotate" defaults to 0 and is reduced
        modulo 360, and "Contents" is always stored as a list.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        page_attrs = self.attrs
        self.lastmod = resolve1(page_attrs.get("LastModified"))
        self.resources = resolve1(page_attrs["Resources"])
        self.mediabox = resolve1(page_attrs["MediaBox"])
        # Without an explicit crop box the media box is used.
        self.cropbox = (resolve1(page_attrs["CropBox"])
                        if "CropBox" in page_attrs else self.mediabox)
        rotation = int_value(page_attrs.get("Rotate", 0))
        self.rotate = (rotation + 360) % 360
        self.annots = page_attrs.get("Annots")
        self.beads = page_attrs.get("B")
        contents = resolve1(page_attrs["Contents"]) if "Contents" in page_attrs else []
        # A single content stream is wrapped so the attribute is a list.
        self.contents = contents if isinstance(contents, list) else [contents]
        return
Пример #16
0
 def do_Do(self, xobjid):
     """Draw the external object registered under ``xobjid``.

     An unknown name raises PDFInterpreterError when STRICT is set and
     is ignored otherwise.  A form XObject with a 'BBox' is rendered by
     a duplicate interpreter with the form's own 'Resources', falling
     back to a copy of ``self.resources`` when those are empty.  An
     image XObject with 'Width' and 'Height' is passed to the device's
     render_image.  Anything else is skipped.
     """
     xobjid = literal_name(xobjid)
     try:
         xobj = stream_value(self.xobjmap[xobjid])
     except KeyError:
         if STRICT:
             raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
         return
     if 1 <= self.debug:
         print >>sys.stderr, 'Processing xobj: %r' % xobj
     xobj_type = xobj.get('Subtype')
     if xobj_type is LITERAL_FORM and 'BBox' in xobj:
         nested = self.dup()
         box = list_value(xobj['BBox'])
         local_matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
         # According to PDF reference 1.7 section 4.9.1, XObjects in
         # earlier PDFs (prior to v1.2) use the page's Resources entry
         # instead of having their own Resources entry.
         resources = dict_value(xobj.get('Resources'))
         if not resources:
             resources = self.resources.copy()
         self.device.begin_figure(xobjid, box, local_matrix)
         nested_ctm = mult_matrix(local_matrix, self.ctm)
         nested.render_contents(resources, [xobj], ctm=nested_ctm)
         self.device.end_figure(xobjid)
     elif xobj_type is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
         # The image is reported inside a unit-square figure.
         self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
         self.device.render_image(xobjid, xobj)
         self.device.end_figure(xobjid)
     # Any other xobject type is unsupported and silently skipped.
     return
Пример #17
0
 def set_root(self, root):
     """Remember ``root`` and use its dictionary value as the catalog.

     Raises PDFSyntaxError if STRICT is set and the resulting catalog
     does not have /Type LITERAL_CATALOG; without STRICT a mismatching
     catalog is accepted as is.
     """
     self.root = root
     self.catalog = dict_value(self.root)
     type_matches = self.catalog.get('Type') is LITERAL_CATALOG
     if not type_matches and STRICT:
         raise PDFSyntaxError('Catalog not found!')
     return
Пример #18
0
 def do_keyword(self, pos, token):
     """Handle a PDF keyword token found at position ``pos``.

     - xref / startxref: the top stack entry is moved to the results.
     - endobj: the top four stack entries are moved to the results.
     - null: ``(pos, None)`` is pushed.
     - R: the top two entries are taken as object id and generation
       number and replaced by a PDFObjRef.
     - stream: the stream dictionary is popped, the stream data is read
       from the file and a PDFStream is pushed.
     - anything else: the token itself is pushed.
     """
     if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
         self.add_results(*self.pop(1))
     elif token is self.KEYWORD_ENDOBJ:
         self.add_results(*self.pop(4))
     elif token is self.KEYWORD_NULL:
         # null object
         self.push((pos, None))
     elif token is self.KEYWORD_R:
         # reference to indirect object
         try:
             ((_, objid), (_, genno)) = self.pop(2)
             (objid, genno) = (int(objid), int(genno))
             obj = PDFObjRef(self.doc, objid, genno)
             self.push((pos, obj))
         except PSSyntaxError:
             # NOTE(review): a PSSyntaxError here is dropped silently and
             # nothing is pushed for this reference.
             pass
     elif token is self.KEYWORD_STREAM:
         # stream object
         ((_, dic),) = self.pop(1)
         dic = dict_value(dic)
         # Declared length of the stream data; stays 0 in fallback mode
         # (or when /Length is missing and handle_error returns).
         objlen = 0
         if not self.fallback:
             try:
                 objlen = int_value(dic['Length'])
             except KeyError:
                 handle_error(PDFSyntaxError, '/Length is undefined: %r' % dic)
         self.seek(pos)
         try:
             (_, line) = self.nextline()  # 'stream'
         except PSEOF:
             handle_error(PDFSyntaxError, 'Unexpected EOF')
             return
         # Skip past the 'stream' line; the data starts right after it.
         pos += len(line)
         self.fp.seek(pos)
         data = self.fp.read(objlen)
         self.seek(pos + objlen)
         # Keep reading lines until 'endstream' shows up, appending
         # everything before it to the data and growing objlen to match.
         while 1:
             try:
                 (linepos, line) = self.nextline()
             except PSEOF:
                 handle_error(PDFSyntaxError, 'Unexpected EOF')
                 break
             if 'endstream' in line:
                 i = line.index('endstream')
                 objlen += i
                 data += line[:i]
                 break
             objlen += len(line)
             data += line
         # Reposition the parser just after the collected stream data.
         self.seek(pos+objlen)
         # XXX limit objlen not to exceed object boundary
         log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
         obj = PDFStream(dic, data, self.doc.decipher)
         self.push((pos, obj))
     else:
         # others
         self.push((pos, token))
Пример #19
0
 def __init__(self, parser, caching=True, fallback=True):
     """Create a document that reads its objects through ``parser``.

     parser: the PDFParser to use; it is linked back to this document
         with ``set_document``.
     caching: stored as ``self.caching``.
     fallback: when true, or when no valid xref can be read, a
         PDFXRefFallback loaded from the parser is appended to
         ``self.xrefs``.

     Raises PDFSyntaxError when no trailer carries a /Root entry, or,
     when STRICT is set, when the catalog's /Type is not LITERAL_CATALOG.
     """
     self.caching = caching
     self.xrefs = []
     self.info = []
     self.catalog = None
     self.encryption = None
     self.decipher = None
     self._parser = None
     self._cached_objs = {}
     self._parsed_objs = {}
     self._parser = parser
     self._parser.set_document(self)
     # Trailers may have been appended to the file several times;
     # collect every cross-reference section that can be found.
     try:
         xref_pos = self.find_xref(parser)
         self.read_xref_from(parser, xref_pos, self.xrefs)
     except PDFNoValidXRef:
         fallback = True
     if fallback:
         parser.fallback = True
         fallback_xref = PDFXRefFallback()
         fallback_xref.load(parser)
         self.xrefs.append(fallback_xref)
     root_found = False
     for section in self.xrefs:
         trailer_dict = section.get_trailer()
         if trailer_dict:
             if 'Encrypt' in trailer_dict:
                 # Remember the encryption parameters as (ID, Encrypt).
                 file_id = list_value(trailer_dict['ID'])
                 encrypt_dict = dict_value(trailer_dict['Encrypt'])
                 self.encryption = (file_id, encrypt_dict)
             if 'Info' in trailer_dict:
                 self.info.append(dict_value(trailer_dict['Info']))
             if 'Root' in trailer_dict:
                 # Every PDF file must have exactly one /Root dictionary.
                 self.catalog = dict_value(trailer_dict['Root'])
                 root_found = True
                 break
     if not root_found:
         raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
     if self.catalog.get('Type') is not LITERAL_CATALOG and STRICT:
         raise PDFSyntaxError('Catalog not found!')
     return
Пример #20
0
 def __init__(self, parser, caching=True, fallback=True):
     """Set up the document and load its trailers through ``parser``.

     parser: the PDFParser to read from; ``set_document(self)`` is
         called on it.
     caching: stored as ``self.caching``.
     fallback: forces the use of PDFXRefFallback; it is also switched
         on when reading the regular xref raises PDFNoValidXRef.

     Raises PDFSyntaxError if no /Root entry is found in any trailer,
     or if STRICT is set and the catalog's /Type is not LITERAL_CATALOG.
     """
     self.caching = caching
     self.xrefs = []
     self.info = []
     self.catalog = None
     self.encryption = None
     self.decipher = None
     self._parser = None
     self._cached_objs = {}
     self._parsed_objs = {}
     self._parser = parser
     self._parser.set_document(self)
     # Read the cross-reference data that was appended (possibly more
     # than once) at the end of the file.
     try:
         start = self.find_xref(parser)
         self.read_xref_from(parser, start, self.xrefs)
     except PDFNoValidXRef:
         fallback = True
     if fallback:
         parser.fallback = True
         recovered = PDFXRefFallback()
         recovered.load(parser)
         self.xrefs.append(recovered)
     for entry in self.xrefs:
         trailer = entry.get_trailer()
         if not trailer:
             continue
         if 'Encrypt' in trailer:
             # Keep the encryption parameters as an (ID, Encrypt) pair.
             doc_id = list_value(trailer['ID'])
             self.encryption = (doc_id, dict_value(trailer['Encrypt']))
         if 'Info' in trailer:
             self.info.append(dict_value(trailer['Info']))
         if 'Root' in trailer:
             # Every PDF file must have exactly one /Root dictionary.
             self.catalog = dict_value(trailer['Root'])
             break
     else:
         # The loop ended without finding any /Root entry.
         raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
     catalog_type = self.catalog.get('Type')
     if catalog_type is not LITERAL_CATALOG and STRICT:
         raise PDFSyntaxError('Catalog not found!')
     return
Пример #21
0
 def search(obj, parent):
     """Walk the page tree below ``obj`` and yield its pages.

     obj: an object id (int), looked up with ``document.getobj``, or an
         object that has an ``objid`` attribute.
     parent: attribute dictionary of the parent node; entries listed in
         ``klass.INHERITABLE_ATTRS`` are copied into the node unless the
         node defines them itself.

     Yields ``(objid, attrs)`` for every node whose /Type is
     LITERAL_PAGE.  ``document`` and ``klass`` come from the enclosing
     scope.
     """
     if isinstance(obj, int):
         objid = obj
         tree = dict_value(document.getobj(objid)).copy()
     else:
         objid = obj.objid
         tree = dict_value(obj).copy()
     for (k, v) in parent.iteritems():
         if k in klass.INHERITABLE_ATTRS and k not in tree:
             tree[k] = v
     if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
         # Arguments are passed to logging unformatted: the message is
         # only built when INFO is enabled, and a tuple value cannot be
         # mistaken for a sequence of format arguments.
         logging.info('Pages: Kids=%r', tree['Kids'])
         for c in list_value(tree['Kids']):
             for x in search(c, tree):
                 yield x
     elif tree.get('Type') is LITERAL_PAGE:
         logging.info('Page: %r', tree)
         yield (objid, tree)
Пример #22
0
 def search(obj, parent):
     """Yield ``(objid, attrs)`` for each page found below ``obj``.

     obj: an object id (int), fetched with ``self.getobj``, or an
         object that has an ``objid`` attribute.
     parent: attribute dictionary of the parent node; entries named in
         ``self.INHERITABLE_ATTRS`` are inherited when the node does not
         define them.  ``self`` comes from the enclosing scope.
     """
     if isinstance(obj, int):
         objid = obj
         source = self.getobj(objid)
     else:
         objid = obj.objid
         source = obj
     # Work on a copy so that inherited entries can be added to it.
     tree = dict_value(source).copy()
     for (name, inherited) in parent.iteritems():
         if name in self.INHERITABLE_ATTRS:
             tree.setdefault(name, inherited)
     node_type = tree.get('Type')
     if node_type is LITERAL_PAGES and 'Kids' in tree:
         if 1 <= self.debug:
             print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
         for kid in list_value(tree['Kids']):
             for found in search(kid, tree):
                 yield found
     elif node_type is LITERAL_PAGE:
         if 1 <= self.debug:
             print >>sys.stderr, 'Page: %r' % tree
         yield (objid, tree)
Пример #23
0
 def load_trailer(self, parser):
     """Read the trailer dictionary from ``parser`` into ``self.trailer``.

     The next token must be the trailer keyword and the object after it
     is taken as the dictionary.  If the parser hits end of file first,
     the object left on its stack is used instead; an empty stack raises
     PDFNoValidXRef.
     """
     try:
         (_pos, keyword) = parser.nexttoken()
         assert keyword is self.KEYWORD_TRAILER
         (_pos, trailer_obj) = parser.nextobject()
     except PSEOF:
         leftovers = parser.pop(1)
         if not leftovers:
             raise PDFNoValidXRef('Unexpected EOF - file corrupted')
         (_pos, trailer_obj) = leftovers[0]
     self.trailer.update(dict_value(trailer_obj))
Пример #24
0
 def search(obj, parent):
     """Yield ``(objid, attrs)`` for each page found below ``obj``.

     obj: an object id (int), fetched with ``document.getobj``, or an
         object that has an ``objid`` attribute.
     parent: attribute dictionary of the parent node; entries named in
         ``klass.INHERITABLE_ATTRS`` are inherited when the node does
         not define them.  ``document``, ``klass`` and ``debug`` come
         from the enclosing scope.
     """
     if isinstance(obj, int):
         objid = obj
         source = document.getobj(objid)
     else:
         objid = obj.objid
         source = obj
     # Work on a copy so that inherited entries can be added to it.
     tree = dict_value(source).copy()
     for (name, inherited) in parent.iteritems():
         if name in klass.INHERITABLE_ATTRS:
             tree.setdefault(name, inherited)
     node_type = tree.get("Type")
     if node_type is LITERAL_PAGES and "Kids" in tree:
         if 1 <= debug:
             print >>sys.stderr, "Pages: Kids=%r" % tree["Kids"]
         for kid in list_value(tree["Kids"]):
             for found in search(kid, tree):
                 yield found
     elif node_type is LITERAL_PAGE:
         if 1 <= debug:
             print >>sys.stderr, "Page: %r" % tree
         yield (objid, tree)
Пример #25
0
 def load_trailer(self, parser):
     """Merge the trailer dictionary read from ``parser`` into ``self.trailer``.

     Expects the trailer keyword followed by the dictionary object.  On
     a premature end of file the object remaining on the parser stack
     is used; if there is none, PDFNoValidXRef is raised.
     """
     try:
         (_pos, token) = parser.nexttoken()
         assert token is self.KEYWORD_TRAILER
         (_pos, raw_trailer) = parser.nextobject()
     except PSEOF:
         remaining = parser.pop(1)
         if not remaining:
             raise PDFNoValidXRef('Unexpected EOF - file corrupted')
         # Each stack entry is a (position, object) pair.
         (_pos, raw_trailer) = remaining[0]
     trailer_dict = dict_value(raw_trailer)
     self.trailer.update(trailer_dict)
     return
Пример #26
0
class PDFCIDFont(PDFFont):
    """CID-keyed font described by a PDF font dictionary.

    The constructor sets ``basefont``, ``cidsysteminfo``, ``cidcoding``,
    ``cmap`` and ``unicode_map``, plus ``fontfile`` when the descriptor
    carries a /FontFile2 entry.
    """

    def __init__(self, rsrcmgr, spec):
        """Initialise the font from the font dictionary *spec*.

        A missing /BaseFont, /Encoding or /FontDescriptor entry, or an
        unknown CMap name, raises PDFFontError when STRICT is set;
        otherwise a placeholder value is used.  *rsrcmgr* is accepted
        but not used by this constructor.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        # "<Registry>-<Ordering>", with 'unknown' for absent parts.
        self.cidcoding = '%s-%s' % (
            self.cidsysteminfo.get('Registry', 'unknown'),
            self.cidsysteminfo.get('Ordering', 'unknown'))
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        try:
            self.cmap = CMapDB.get_cmap(name)
        except CMapDB.CMapNotFound as e:
            # "as" form: valid on Python 2.6+ and on Python 3.
            if STRICT:
                raise PDFFontError(e)
            self.cmap = CMap()
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
        # Pick the CID -> Unicode mapping: an explicit /ToUnicode stream
        # wins, then the embedded TrueType font for the listed codings,
        # then the predefined map for this coding.
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
        elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    # Best effort: leave unicode_map as None.
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding, self.cmap.is_vertical())
            except CMapDB.CMapNotFound:
                # Best effort: leave unicode_map as None.
                pass
Пример #27
0
    def init_resources(self, resources):
        """Rebuild the font, XObject and colour-space maps from *resources*.

        The maps are always reset; nothing else happens when *resources*
        is falsy.
        """
        self.resources = resources
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec):
            # A colour space is either a bare name or a list whose first
            # element is the family name.
            is_array = isinstance(spec, list)
            name = literal_name(spec[0] if is_array else spec)
            if is_array and 2 <= len(spec):
                if name == "ICCBased":
                    return PDFColorSpace(name, stream_value(spec[1])["N"])
                if name == "DeviceN":
                    return PDFColorSpace(name, len(list_value(spec[1])))
            return PREDEFINED_COLORSPACE.get(name)

        for (category, entries) in dict_value(resources).iteritems():
            if 2 <= self.debug:
                print >>sys.stderr, "Resource: %r: %r" % (category, entries)
            if category == "Font":
                for (fontid, fontref) in dict_value(entries).iteritems():
                    # Only indirect references carry an object id.
                    if isinstance(fontref, PDFObjRef):
                        objid = fontref.objid
                    else:
                        objid = None
                    fontspec = dict_value(fontref)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, fontspec)
            elif category == "ColorSpace":
                for (csid, csspec) in dict_value(entries).iteritems():
                    self.csmap[csid] = get_colorspace(resolve1(csspec))
            elif category == "ProcSet":
                self.rsrcmgr.get_procset(list_value(entries))
            elif category == "XObject":
                for (xobjid, xobjstrm) in dict_value(entries).iteritems():
                    self.xobjmap[xobjid] = xobjstrm
Пример #28
0
 def get_dest(self, name):
     """Return the destination object registered under *name*.

     The name is looked up through ``self.lookup_name('Dests', name)``
     first; on KeyError the catalog's own /Dests dictionary is tried.
     Raises PDFDestinationNotFound when neither has the name.
     """
     try:
         # PDF-1.2 or later
         return self.lookup_name('Dests', name)
     except KeyError:
         # PDF-1.1 or prior
         if 'Dests' in self.catalog:
             legacy = dict_value(self.catalog['Dests'])
             if name in legacy:
                 return legacy[name]
         raise PDFDestinationNotFound(name)
Пример #29
0
 def lookup(d):
     """Search name-tree node *d* and its kids for ``key``.

     ``key`` and ``cat`` are free variables taken from the enclosing
     scope.  Returns the value paired with ``key``, or None when ``key``
     lies outside the node's /Limits range.  Raises KeyError when a
     /Names array does not contain ``key`` or when nothing is found.
     """
     if 'Limits' in d:
         (k1,k2) = list_value(d['Limits'])
         if key < k1 or k2 < key: return None
     # /Names must be examined whether or not /Limits is present: a node
     # that holds names directly without /Limits (e.g. the root of the
     # tree) was previously skipped and ended in KeyError.
     if 'Names' in d:
         objs = list_value(d['Names'])
         # The array alternates key, value, key, value, ...
         names = dict(choplist(2, objs))
         return names[key]
     if 'Kids' in d:
         for c in list_value(d['Kids']):
             v = lookup(dict_value(c))
             if v: return v
     raise KeyError((cat,key))
Пример #30
0
 def lookup(d):
     """Find ``key`` (from the enclosing scope) in name-tree node *d*.

     Returns None when ``key`` is outside the node's /Limits, the paired
     value when the node's /Names holds it, or the first truthy result
     from its /Kids.  Raises KeyError otherwise.
     """
     if 'Limits' in d:
         (lower, upper) = list_value(d['Limits'])
         if key < lower or upper < key:
             return None
     if 'Names' in d:
         # Flat array of alternating keys and values.
         pairs = choplist(2, list_value(d['Names']))
         return dict(pairs)[key]
     if 'Kids' in d:
         for kid in list_value(d['Kids']):
             found = lookup(dict_value(kid))
             if found:
                 return found
     raise KeyError((cat, key))
Пример #31
0
 def get_dest(self, name):
     """Resolve the named destination *name*.

     Tries ``self.lookup_name('Dests', name)`` and, if that raises
     KeyError, the /Dests dictionary stored directly in the catalog.
     Raises PDFDestinationNotFound when the name is in neither place.
     """
     try:
         # PDF-1.2 or later
         return self.lookup_name('Dests', name)
     except KeyError:
         # PDF-1.1 or prior
         if 'Dests' in self.catalog:
             table = dict_value(self.catalog['Dests'])
             if name in table:
                 return table[name]
         raise PDFDestinationNotFound(name)
Пример #32
0
 def __init__(self, rsrcmgr, spec):
     """Set up the font from *spec*; *rsrcmgr* is not used here.

     Widths are keyed by character code starting at /FirstChar.  Without
     a /FontDescriptor a minimal one is made from the font's /FontBBox.
     """
     first_code = int_value(spec.get("FirstChar", 0))
     # The value is not used afterwards; the call is kept as in the
     # original so /LastChar still goes through int_value.
     int_value(spec.get("LastChar", 0))
     width_list = list_value(spec.get("Widths", [0] * 256))
     width_map = dict(
         (offset + first_code, width)
         for (offset, width) in enumerate(width_list))
     if "FontDescriptor" not in spec:
         descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
     else:
         descriptor = dict_value(spec["FontDescriptor"])
     PDFSimpleFont.__init__(self, descriptor, width_map, spec)
     self.matrix = tuple(list_value(spec.get("FontMatrix")))
     # Descent and ascent are the 2nd and 4th entries of self.bbox.
     (_x0, descent, _x1, ascent) = self.bbox
     self.descent = descent
     self.ascent = ascent
     (hscale, vscale) = apply_matrix_norm(self.matrix, (1, 1))
     self.hscale = hscale
     self.vscale = vscale
Пример #33
0
 def __init__(self, rsrcmgr, spec):
     """Build the font from the font dictionary *spec*.

     *rsrcmgr* is accepted but not used.  When *spec* has no
     /FontDescriptor, a descriptor with zero ascent/descent and the
     font's own /FontBBox is substituted.
     """
     base = int_value(spec.get('FirstChar', 0))
     # Result unused; the call is kept as in the original.
     int_value(spec.get('LastChar', 0))
     raw_widths = list_value(spec.get('Widths', [0] * 256))
     widths = dict((idx + base, w) for (idx, w) in enumerate(raw_widths))
     if 'FontDescriptor' not in spec:
         descriptor = {
             'Ascent': 0,
             'Descent': 0,
             'FontBBox': spec['FontBBox'],
         }
     else:
         descriptor = dict_value(spec['FontDescriptor'])
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     self.matrix = tuple(list_value(spec.get('FontMatrix')))
     (_left, bottom, _right, top) = self.bbox
     self.descent = bottom
     self.ascent = top
     scales = apply_matrix_norm(self.matrix, (1, 1))
     (self.hscale, self.vscale) = scales
Пример #34
0
    def init_resources(self, resources):
        """Prepare the fonts and XObjects listed in the Resource attribute."""
        self.resources = resources
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec):
            # The family name is the spec itself, or its first element
            # when the spec is a list.
            listed = isinstance(spec, list)
            if listed:
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            has_param = listed and 2 <= len(spec)
            if has_param and name == 'ICCBased':
                return PDFColorSpace(name, stream_value(spec[1])['N'])
            if has_param and name == 'DeviceN':
                return PDFColorSpace(name, len(list_value(spec[1])))
            return PREDEFINED_COLORSPACE.get(name)

        for (kind, value) in dict_value(resources).iteritems():
            log.debug('Resource: %r: %r', kind, value)
            if kind == 'Font':
                for (fontid, fontref) in dict_value(value).iteritems():
                    # Only indirect references carry an object id.
                    objid = None
                    if isinstance(fontref, PDFObjRef):
                        objid = fontref.objid
                    fontspec = dict_value(fontref)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, fontspec)
            elif kind == 'ColorSpace':
                for (csid, csspec) in dict_value(value).iteritems():
                    self.csmap[csid] = get_colorspace(resolve1(csspec))
            elif kind == 'ProcSet':
                self.rsrcmgr.get_procset(list_value(value))
            elif kind == 'XObject':
                for (xobjid, xobjstrm) in dict_value(value).iteritems():
                    self.xobjmap[xobjid] = xobjstrm
Пример #35
0
 def __init__(self, rsrcmgr, spec):
     """Initialise the font from *spec* (*rsrcmgr* is unused).

     Builds a code -> width table offset by /FirstChar, picks or
     synthesises a descriptor, then derives ascent/descent from
     self.bbox and the scales from /FontMatrix.
     """
     start = int_value(spec.get('FirstChar', 0))
     # Result unused; the call is kept as in the original.
     int_value(spec.get('LastChar', 0))
     listed = list_value(spec.get('Widths', [0]*256))
     widths = dict((pos + start, width) for (pos, width) in enumerate(listed))
     if 'FontDescriptor' not in spec:
         descriptor = {'Ascent': 0, 'Descent': 0,
                       'FontBBox': spec['FontBBox']}
     else:
         descriptor = dict_value(spec['FontDescriptor'])
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     self.matrix = tuple(list_value(spec.get('FontMatrix')))
     (_llx, lly, _urx, ury) = self.bbox
     self.descent = lly
     self.ascent = ury
     (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
Пример #36
0
 def search(entry, level):
     """Yield ``(level, title, dest, action, se)`` for outline entries.

     An entry is reported only when it has a /Title and at least one of
     /A or /Dest.  The /First child is visited one level deeper (only
     when /Last is present too), then the /Next sibling at this level.
     """
     entry = dict_value(entry)
     if 'Title' in entry and ('A' in entry or 'Dest' in entry):
         yield (level,
                decode_text(str_value(entry['Title'])),
                entry.get('Dest'),
                entry.get('A'),
                entry.get('SE'))
     if 'First' in entry and 'Last' in entry:
         for item in search(entry['First'], level + 1):
             yield item
     if 'Next' in entry:
         for item in search(entry['Next'], level):
             yield item
Пример #37
0
 def search(entry, level):
     """Walk the outline starting at *entry*, yielding one tuple per item.

     Each tuple is ``(level, title, dest, action, se)``.  Entries without
     a /Title, or with neither /A nor /Dest, are not reported but their
     children and siblings are still visited.
     """
     node = dict_value(entry)
     if 'Title' in node:
         if 'A' in node or 'Dest' in node:
             record = (
                 level,
                 decode_text(str_value(node['Title'])),
                 node.get('Dest'),
                 node.get('A'),
                 node.get('SE'),
             )
             yield record
     if 'First' in node and 'Last' in node:
         # Children are one level deeper.
         for child in search(node['First'], level+1):
             yield child
     if 'Next' in node:
         # Siblings stay on the same level.
         for sibling in search(node['Next'], level):
             yield sibling
Пример #38
0
 def __init__(self, rsrc, spec):
     """Initialise the font from *spec* (*rsrc* is unused).

     Metrics come from FontMetricsDB for the base font name; when that
     lookup raises KeyError they are read from *spec* itself.  A missing
     /BaseFont raises PDFFontError under STRICT, else becomes 'unknown'.
     """
     try:
         basefont = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         basefont = 'unknown'
     self.basefont = basefont
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
     except KeyError:
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         first_code = int_value(spec.get('FirstChar', 0))
         # Result unused; the call is kept as in the original.
         int_value(spec.get('LastChar', 255))
         width_list = list_value(spec.get('Widths', [0]*256))
         widths = dict(
             (offset + first_code, width)
             for (offset, width) in enumerate(width_list))
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
Пример #39
0
 def __init__(self, rsrc, spec):
     """Build the font from the font dictionary *spec*.

     *rsrc* is accepted but not used.  FontMetricsDB is consulted first;
     on KeyError the descriptor and widths are taken from *spec*, with
     widths keyed by character code starting at /FirstChar.
     """
     try:
         name = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         name = 'unknown'
     self.basefont = name
     try:
         metrics = FontMetricsDB.get_metrics(self.basefont)
         (descriptor, widths) = metrics
     except KeyError:
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         base = int_value(spec.get('FirstChar', 0))
         # Result unused; the call is kept as in the original.
         int_value(spec.get('LastChar', 255))
         listed = list_value(spec.get('Widths', [0] * 256))
         widths = dict((idx + base, w) for (idx, w) in enumerate(listed))
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
Пример #40
0
 def get_font(self, objid, spec):
     """Return the font for *spec*, creating it when it is not cached.

     A font stored in ``self._cached_fonts`` under a truthy *objid* is
     returned unchanged.  Otherwise the font class is chosen from the
     /Subtype entry; a Type0 font is built from its first descendant
     font, with /Encoding and /ToUnicode copied over from *spec*.  The
     new font is cached when *objid* is truthy and ``self.caching`` is
     set.  Under STRICT, malformed specs raise PDFFontError.
     """
     if objid and objid in self._cached_fonts:
         return self._cached_fonts[objid]
     if 2 <= self.debug:
         print >> sys.stderr, 'get_font: create: objid=%r, spec=%r' % (
             objid, spec)
     if STRICT and spec['Type'] is not LITERAL_FONT:
         raise PDFFontError('Type is not /Font')
     # Work out the subtype; lenient mode falls back to Type1.
     if 'Subtype' not in spec:
         if STRICT:
             raise PDFFontError('Font Subtype is not specified.')
         subtype = 'Type1'
     else:
         subtype = literal_name(spec['Subtype'])
     if subtype in ('Type1', 'MMType1'):
         font = PDFType1Font(self, spec)
     elif subtype == 'TrueType':
         font = PDFTrueTypeFont(self, spec)
     elif subtype == 'Type3':
         font = PDFType3Font(self, spec)
     elif subtype in ('CIDFontType0', 'CIDFontType2'):
         font = PDFCIDFont(self, spec)
     elif subtype == 'Type0':
         # Composite font: build the first descendant instead.
         descendants = list_value(spec['DescendantFonts'])
         assert descendants
         subspec = dict_value(descendants[0]).copy()
         for inherited in ('Encoding', 'ToUnicode'):
             if inherited in spec:
                 subspec[inherited] = resolve1(spec[inherited])
         font = self.get_font(None, subspec)
     else:
         if STRICT:
             raise PDFFontError('Invalid Font spec: %r' % spec)
         font = PDFType1Font(self, spec)  # this is so wrong!
     if objid and self.caching:
         self._cached_fonts[objid] = font
     return font
Пример #41
0
 def init_params(self):
     """Read the crypt-filter settings from self.param.

     After the parent's init_params, sets length to 128 and fills cf,
     stmf, strf, encrypt_metadata and cfm.  Raises PDFEncryptionError
     when /StmF and /StrF differ, when a filter's /CFM is not recognised
     by get_cfm, or when /StrF names a filter that is not defined.
     """
     super(PDFStandardSecurityHandlerV4, self).init_params()
     self.length = 128
     self.cf = dict_value(self.param.get('CF'))
     self.stmf = literal_name(self.param['StmF'])
     self.strf = literal_name(self.param['StrF'])
     self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
     if self.stmf != self.strf:
         raise PDFEncryptionError('Unsupported crypt filter: param=%r' % self.param)
     self.cfm = {}
     for filter_name, filter_spec in self.cf.items():
         method = self.get_cfm(literal_name(filter_spec['CFM']))
         if method is None:
             raise PDFEncryptionError('Unknown crypt filter method: param=%r' % self.param)
         self.cfm[filter_name] = method
     # Assigned after the loop, so it replaces any /CF entry of that name.
     self.cfm['Identity'] = self.decrypt_identity
     if self.strf not in self.cfm:
         raise PDFEncryptionError('Undefined crypt filter: param=%r' % self.param)
Пример #42
0
 def get_font(self, objid, spec):
     """Look up or create the font object described by *spec*.

     Cached fonts (keyed by a truthy *objid*) are returned directly.
     New fonts are built according to /Subtype and stored in the cache
     when *objid* is truthy and ``self.caching`` is set.  Under STRICT,
     a wrong /Type, a missing /Subtype or an unknown subtype raises
     PDFFontError; otherwise Type1 is assumed.
     """
     if objid and objid in self._cached_fonts:
         return self._cached_fonts[objid]

     if 2 <= self.debug:
         print >>sys.stderr, "get_font: create: objid=%r, spec=%r" % (objid, spec)
     if STRICT and spec["Type"] is not LITERAL_FONT:
         raise PDFFontError("Type is not /Font")

     if "Subtype" not in spec:
         if STRICT:
             raise PDFFontError("Font Subtype is not specified.")
         kind = "Type1"
     else:
         kind = literal_name(spec["Subtype"])

     if kind in ("Type1", "MMType1"):
         font = PDFType1Font(self, spec)
     elif kind == "TrueType":
         font = PDFTrueTypeFont(self, spec)
     elif kind == "Type3":
         font = PDFType3Font(self, spec)
     elif kind in ("CIDFontType0", "CIDFontType2"):
         font = PDFCIDFont(self, spec)
     elif kind == "Type0":
         # Type0 delegates to its first descendant font, which takes
         # over the parent's /Encoding and /ToUnicode entries.
         dfonts = list_value(spec["DescendantFonts"])
         assert dfonts
         child_spec = dict_value(dfonts[0]).copy()
         for entry in ("Encoding", "ToUnicode"):
             if entry in spec:
                 child_spec[entry] = resolve1(spec[entry])
         font = self.get_font(None, child_spec)
     else:
         if STRICT:
             raise PDFFontError("Invalid Font spec: %r" % spec)
         font = PDFType1Font(self, spec)  # this is so wrong!

     if objid and self.caching:
         self._cached_fonts[objid] = font
     return font
 def get_font(self, objid, spec):
     """Return a font for *spec*, reusing the cached one when possible.

     The cache is consulted only for a truthy *objid*.  The font class
     follows /Subtype (Type1 when absent or unrecognised and STRICT is
     off); a Type0 font resolves to its first descendant font.
     """
     cache = self._cached_fonts
     if objid and objid in cache:
         return cache[objid]
     if 2 <= self.debug:
         print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
     if STRICT and spec['Type'] is not LITERAL_FONT:
         raise PDFFontError('Type is not /Font')
     # Create a Font object.
     if 'Subtype' not in spec:
         if STRICT:
             raise PDFFontError('Font Subtype is not specified.')
         subtype = 'Type1'
     else:
         subtype = literal_name(spec['Subtype'])
     if subtype in ('Type1', 'MMType1'):
         created = PDFType1Font(self, spec)
     elif subtype == 'TrueType':
         created = PDFTrueTypeFont(self, spec)
     elif subtype == 'Type3':
         created = PDFType3Font(self, spec)
     elif subtype in ('CIDFontType0', 'CIDFontType2'):
         created = PDFCIDFont(self, spec)
     elif subtype == 'Type0':
         descendant_list = list_value(spec['DescendantFonts'])
         assert descendant_list
         subspec = dict_value(descendant_list[0]).copy()
         for carried in ('Encoding', 'ToUnicode'):
             if carried in spec:
                 subspec[carried] = resolve1(spec[carried])
         created = self.get_font(None, subspec)
     else:
         if STRICT:
             raise PDFFontError('Invalid Font spec: %r' % spec)
         created = PDFType1Font(self, spec) # this is so wrong!
     if objid and self.caching:
         self._cached_fonts[objid] = created
     return created
Пример #44
0
 def __init__(self, rsrcmgr, spec):
     """Initialise the CID font fields from the font dictionary *spec*.

     Sets ``basefont``, ``cidsysteminfo``, ``cidcoding`` and ``cmap``.
     Problems (missing /BaseFont or /Encoding, unknown CMap name) are
     reported through ``handle_error(PDFFontError, ...)``; if that call
     returns, a placeholder value is used.  *rsrcmgr* is not used here.
     """
     try:
         self.basefont = literal_name(spec['BaseFont'])
     except KeyError:
         handle_error(PDFFontError, 'BaseFont is missing')
         self.basefont = 'unknown'
     self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
     # "<Registry>-<Ordering>", with 'unknown' for absent parts.
     self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                 self.cidsysteminfo.get('Ordering', 'unknown'))
     try:
         name = literal_name(spec['Encoding'])
     except KeyError:
         handle_error(PDFFontError, 'Encoding is unspecified')
         name = 'unknown'
     try:
         self.cmap = CMapDB.get_cmap(name)
     except CMapDB.CMapNotFound as e:
         # "as" form: valid on Python 2.6+ and on Python 3.
         handle_error(PDFFontError, str(e))
         self.cmap = CMap()
Пример #45
0
 def __init__(self, rsrc, spec):
     """Initialise the CID font fields from the font dictionary *spec*.

     Sets ``basefont``, ``cidsysteminfo``, ``cidcoding`` and ``cmap``.
     The CMap is obtained from ``rsrc.get_cmap(name, strict=STRICT)``.

     Raises:
         PDFFontError: under STRICT when /BaseFont or /Encoding is
             missing, and always when the CMap lookup raises
             CMapDB.CMapNotFound.
     """
     try:
         self.basefont = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         self.basefont = 'unknown'
     self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
     # "<Registry>-<Ordering>", with 'unknown' for absent parts.
     self.cidcoding = '%s-%s' % (
         self.cidsysteminfo.get('Registry', 'unknown'),
         self.cidsysteminfo.get('Ordering', 'unknown'))
     try:
         name = literal_name(spec['Encoding'])
     except KeyError:
         if STRICT:
             raise PDFFontError('Encoding is unspecified')
         name = 'unknown'
     try:
         self.cmap = rsrc.get_cmap(name, strict=STRICT)
     except CMapDB.CMapNotFound as e:
         # "as" form: valid on Python 2.6+ and on Python 3.
         raise PDFFontError(e)
Пример #46
0
 def __init__(self, rsrcmgr, spec):
     """Initialise the font from *spec* (*rsrcmgr* is unused).

     Metrics come from FontMetricsDB, or from *spec* when that lookup
     raises KeyError.  When *spec* has no /Encoding but the descriptor
     has a /FontFile, the encoding is read from the first /Length1 bytes
     of that font file and stored in ``cid2unicode``.
     """
     try:
         basefont = literal_name(spec['BaseFont'])
     except KeyError:
         handle_error(PDFFontError, 'BaseFont is missing')
         basefont = 'unknown'
     self.basefont = basefont
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
     except KeyError:
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         first_code = int_value(spec.get('FirstChar', 0))
         # Result unused; the call is kept as in the original.
         int_value(spec.get('LastChar', 255))
         width_list = list_value(spec.get('Widths', [0] * 256))
         widths = dict(
             (offset + first_code, width)
             for (offset, width) in enumerate(width_list))
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     if 'Encoding' in spec or 'FontFile' not in descriptor:
         return
     # try to recover the missing encoding info from the font file.
     self.fontfile = stream_value(descriptor.get('FontFile'))
     header_len = int_value(self.fontfile['Length1'])
     header = self.fontfile.get_data()[:header_len]
     self.cid2unicode = Type1FontHeaderParser(StringIO(header)).get_encoding()
Пример #47
0
 def __init__(self, rsrcmgr, spec):
     """Build the font from the font dictionary *spec*.

     *rsrcmgr* is accepted but not used.  A missing /BaseFont raises
     PDFFontError under STRICT and otherwise becomes 'unknown'.  If
     *spec* lacks /Encoding and the descriptor has a /FontFile, the
     encoding is parsed from the start of that font file.
     """
     try:
         name = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         name = 'unknown'
     self.basefont = name
     try:
         metrics = FontMetricsDB.get_metrics(self.basefont)
         (descriptor, widths) = metrics
     except KeyError:
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         base = int_value(spec.get('FirstChar', 0))
         # Result unused; the call is kept as in the original.
         int_value(spec.get('LastChar', 255))
         listed = list_value(spec.get('Widths', [0]*256))
         widths = dict((idx + base, w) for (idx, w) in enumerate(listed))
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     needs_recovery = 'Encoding' not in spec and 'FontFile' in descriptor
     if needs_recovery:
         # try to recover the missing encoding info from the font file.
         self.fontfile = stream_value(descriptor.get('FontFile'))
         length1 = int_value(self.fontfile['Length1'])
         head = self.fontfile.get_data()[:length1]
         header_parser = Type1FontHeaderParser(StringIO(head))
         self.cid2unicode = header_parser.get_encoding()
Пример #48
0
    def __init__(self, rsrcmgr, spec):
        """Build a CID-keyed font from the font dictionary *spec*.

        Sets basefont, cidsysteminfo, cidcoding, cmap, unicode_map,
        vertical, disps and default_disp (plus fontfile when the
        descriptor has /FontFile2), then calls PDFFont.__init__ with the
        descriptor, the width table and the default width.

        Under STRICT, a missing /BaseFont, /Encoding or /FontDescriptor
        and an unknown CMap name raise PDFFontError; otherwise a
        placeholder is used.  *rsrcmgr* is not used in this method.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        # "<Registry>-<Ordering>", with 'unknown' for absent parts.
        self.cidcoding = '%s-%s' % (self.cidsysteminfo.get(
            'Registry',
            'unknown'), self.cidsysteminfo.get('Ordering', 'unknown'))
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        try:
            self.cmap = CMapDB.get_cmap(name)
        except CMapDB.CMapNotFound as e:
            if STRICT:
                raise PDFFontError(e)
            # Lenient mode: fall back to a default-constructed CMap.
            self.cmap = CMap()
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        # Wrap the /FontFile2 stream data, when present, in a TrueTypeFont.
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
        # Choose the unicode map: an explicit /ToUnicode stream first,
        # then the embedded font for 'Adobe-Identity', otherwise the map
        # CMapDB provides for this cidcoding.  It stays None on failure.
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding, self.cmap.is_vertical())
            except CMapDB.CMapNotFound as e:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            # NOTE(review): get_widths2 appears to map cid -> (w, (vx, vy)),
            # judging by the unpacking below - confirm against its definition.
            widths = get_widths2(list_value(spec.get('W2', [])))
            self.disps = dict(
                (cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
            # /DW2 defaults to [880, -1000] when absent.
            (vy, w) = spec.get('DW2', [880, -1000])
            self.default_disp = (None, vy)
            # Reduce the table to cid -> w for PDFFont.__init__.
            widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get('W', [])))
            # /DW defaults to 1000 when absent.
            default_width = spec.get('DW', 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)
        return
Пример #49
0
    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        *pos* is the position associated with *token*.  Depending on the
        keyword, entries are popped from the parser stack and either
        passed to add_results or replaced by a new (pos, object) entry.
        Any token not handled below is pushed back unchanged.
        """
        
        # xref / startxref: hand the single preceding entry to the results.
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
        
        # endobj: hand the four preceding entries to the results.
        elif token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            try:
                # The two preceding entries are the object id and the
                # generation number.
                ((_,objid), (_,genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                # Only PSSyntaxError is ignored here; nothing is pushed.
                pass

        elif token is self.KEYWORD_STREAM:
            # stream object
            # The preceding entry is the stream dictionary.
            ((_,dic),) = self.pop(1)
            dic = dict_value(dic)
            # objlen stays 0 in fallback mode or when /Length is missing
            # (non-STRICT); the scan for 'endstream' below then finds
            # the whole payload.
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic['Length'])
                except KeyError:
                    if STRICT:
                        raise PDFSyntaxError('/Length is undefined: %r' % dic)
            # Go back to the keyword and read the rest of its line.
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # pos now points just past that line.
            pos += len(line)
            # Read objlen bytes directly from the underlying file object.
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos+objlen)
            # Keep appending lines until one contains 'endstream'; only
            # the part of that line before the marker is kept.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if 'endstream' in line:
                    i = line.index('endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            self.seek(pos+objlen)
            # XXX limit objlen not to exceed object boundary
            if 2 <= self.debug:
                print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))

        else:
            # others
            self.push((pos, token))
        
        return