示例#1
0
 def init_resources(self, resources):
   """Rebuild the font, XObject and colour-space tables from a resource dict.

   resources: the resource dictionary (or a reference to one).  When it is
     empty or None the tables are reset and nothing else happens.
   """
   self.fontmap = {}
   self.xobjmap = {}
   self.csmap = PREDEFINED_COLORSPACE.copy()
   if not resources: return
   def get_colorspace(spec):
     # A colour space is either a bare name or an array whose first
     # element is the family name.
     if isinstance(spec, list):
       name = literal_name(spec[0])
     else:
       name = literal_name(spec)
     if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
       return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
     elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
       return PDFColorSpace(name, len(list_value(spec[1])))
     else:
       # An unrecognised family yields None instead of a KeyError, so one
       # exotic colour space no longer aborts the whole resource setup.
       return PREDEFINED_COLORSPACE.get(name)
   for (k,v) in dict_value(resources).iteritems():
     if 1 <= self.debug:
       print >>stderr, 'Resource: %r: %r' % (k,v)
     if k == 'Font':
       for (fontid,spec) in dict_value(v).iteritems():
         # Remember the object id (if the font is referenced indirectly)
         # before resolving; it is handed to get_font together with the
         # resolved dictionary.
         objid = None
         if isinstance(spec, PDFObjRef):
           objid = spec.objid
         spec = dict_value(spec)
         self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
     elif k == 'ColorSpace':
       for (csid,spec) in dict_value(v).iteritems():
         self.csmap[csid] = get_colorspace(resolve1(spec))
     elif k == 'ProcSet':
       self.rsrc.get_procset(list_value(v))
     elif k == 'XObject':
       # XObjects are stored unresolved.
       for (xobjid,xobjstrm) in dict_value(v).iteritems():
         self.xobjmap[xobjid] = xobjstrm
   return
示例#2
0
 def set_parser(self, parser):
   """Attach a parser and read the xref trailers to find the root object.

   Does nothing when a parser is already attached.  Raises PDFSyntaxError
   when none of the trailers has a 'Root' entry.
   """
   if self.parser: return
   self.parser = parser
   # The document is set to be temporarily ready during collecting
   # all the basic information about the document, e.g.
   # the header, the encryption information, and the access rights
   # for the document.
   self.ready = True
   # Retrieve the information of each header that was appended
   # (maybe multiple times) at the end of the document.
   self.xrefs = parser.read_xref()
   for xref in self.xrefs:
     trailer = xref.trailer
     if not trailer: continue
     # If there's an encryption info, remember it.
     if 'Encrypt' in trailer:
       #assert not self.encryption
       if 'ID' in trailer:
         docid = list_value(trailer['ID'])
       else:
         # Some encrypted files carry no ID entry; fall back to two
         # empty strings instead of failing with KeyError.
         docid = ('', '')
       self.encryption = (docid, dict_value(trailer['Encrypt']))
     if 'Root' in trailer:
       self.set_root(dict_value(trailer['Root']))
       break
   else:
     raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
   # The document is set to be non-ready again, until all the
   # proper initialization (asking the password key and
   # verifying the access permission, so on) is finished.
   self.ready = False
   return
示例#3
0
 def set_parser(self, parser):
     """Attach a parser and read the xref trailers to find the root object.

     Does nothing when a parser is already attached.  Raises
     PDFSyntaxError when none of the trailers has a 'Root' entry.
     """
     if self.parser: return
     self.parser = parser
     # The document is set to be temporarily ready during collecting
     # all the basic information about the document, e.g.
     # the header, the encryption information, and the access rights
     # for the document.
     self.ready = True
     # Retrieve the information of each header that was appended
     # (maybe multiple times) at the end of the document.
     self.xrefs = parser.read_xref()
     for xref in self.xrefs:
         trailer = xref.trailer
         if not trailer: continue
         # If there's an encryption info, remember it.
         if 'Encrypt' in trailer:
             #assert not self.encryption
             if 'ID' in trailer:
                 docid = list_value(trailer['ID'])
             else:
                 # Some encrypted files carry no ID entry; fall back to
                 # two empty strings instead of failing with KeyError.
                 docid = ('', '')
             self.encryption = (docid, dict_value(trailer['Encrypt']))
         if 'Root' in trailer:
             self.set_root(dict_value(trailer['Root']))
             break
     else:
         raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
     # The document is set to be non-ready again, until all the
     # proper initialization (asking the password key and
     # verifying the access permission, so on) is finished.
     self.ready = False
     return
示例#4
0
 def __validateOutline(self, outl, id):
     """Walk an outline item, its siblings and its children.

     outl: the outline item dictionary.
     id: identifier used in the context string passed along with this
         item's action.
     """
     # Follow the sibling link first, then the child link; each linked
     # item is validated under the object id of its reference.
     for link in ("Next", "First"):
         target = outl.get(link)
         if target != None:
             self.__validateOutline(dict_value(target), target.objid)
     # Finally check the action attached to this item itself.
     action = outl.get("A")
     if action != None:
         self.__validateAction(dict_value(action), "in outline " + str(id))
示例#5
0
 def validate(self, fileName):
     """Load a document and run the catalog checks and all per-page checks.

     fileName: handed to the document loader.

     For every page the XObject, ExtGState, Pattern and Font resources and
     the annotations are validated, then the page is run through the
     interpreter.
     """
     self.__loadDocument(fileName)
     self.__write("GLOBAL:")
     self.__validateDocumentCatalog(self.__doc.catalog)
     for (pageIndex, p) in enumerate(self.__doc.get_pages()):
         # Pages are reported with 1-based numbers.
         self.__write("PAGE " + str(pageIndex + 1) + ":", error=False)
         images = dict_value(p.resources.get("XObject"))
         for (k, v) in images.iteritems():
             self.__validateXObjectDictionary(dict_value(v),
                                              literal_name(k))
         gstates = dict_value(p.resources.get("ExtGState"))
         for (k, v) in gstates.iteritems():
             self.__validateGraphicsStateParameterDictionary(
                 dict_value(v), literal_name(k))
         # TODO: does this really cover all patterns of type 2?
         patterns = dict_value(p.resources.get("Pattern"))
         for (k, v) in patterns.iteritems():
             self.__validatePattern(dict_value(v), literal_name(k))
         for (i, a) in enumerate(list_value(p.annots)):
             # str() on the page id: it is not guaranteed to be a string,
             # and concatenating a non-string raised TypeError.
             self.__validateAnnotationDictionary(
                 dict_value(a),
                 str(i) + " on page " + str(p.pageid))
         # TODO: should check whether the font is actually used (sec. 6.3.4)
         fonts = dict_value(p.resources.get("Font"))
         for (k, v) in fonts.iteritems():
             self.__validateFont(dict_value(v), literal_name(k))
         self.__interp.process_page(p)
示例#6
0
 def __validateDocumentCatalog(self, doc):
     """Check the document catalog and the structures reachable from it.

     doc: the catalog dictionary.

     Findings are reported through self.__write; form fields, outline
     items and the open action are handed to their own validators.
     """
     # 6.6.2
     if doc.get("AA") is not None:
         self.__write("Document catalog contains AA entry")
     # 6.8.2.2
     markInfo = doc.get("MarkInfo")
     if markInfo is None:
         self.__write("Document catalog does not contain MarkInfo entry")
     elif not dict_value(markInfo).get("Marked"):
         self.__write(
             "Marked flag in mark information dictionary is not set")
     # 6.8.4
     if doc.get("Lang") is None:
         self.__write("Document catalog does not specify language")
     # 6.1.11
     names = doc.get("Names")
     if names is not None:
         if dict_value(names).get("EmbeddedFiles") is not None:
             self.__write(
                 "Document name dictionary contains EmbeddedFiles key")
     # 6.1.13
     if doc.get("OCProperties") is not None:
         self.__write("Document catalog contains OCProperties key")
     acroForm = doc.get("AcroForm")
     if acroForm is not None:
         fields = list_value(dict_value(acroForm).get("Fields"))
         for (i, f) in enumerate(fields):
             self.__validateField(dict_value(f), str(i) + " in AcroForm")
     outlines = doc.get("Outlines")
     if outlines is not None:
         first = dict_value(outlines).get("First")
         # An outline dictionary without a First entry has no items;
         # previously this case ended in an AttributeError on None.
         if first is not None:
             self.__validateOutline(dict_value(first), first.objid)
     # NOTE(review): dict_value2 is defined outside this block; its return
     # type is not visible here, so the original comparison is kept as is.
     if dict_value2(doc.get("OpenAction")) != None:
         self.__validateAction(dict_value(doc.get("OpenAction")),
                               "OpenAction from document catalog")
    def do_Do(self, xobjid):
        # the base of this function is basically copy-pasted from ancestor; unfortunately, I found no better solution
        xobjid = literal_name(xobjid)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        if self.debug:
            logging.info("Processing xobj: %r" % xobj)
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            interpreter.is_first_level_call = None
            bbox = list_value(xobj["BBox"])
            matrix = list_value(xobj.get("Matrix", MATRIX_IDENTITY))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            resources = dict_value(xobj.get("Resources")) or self.resources.copy()

            self.device.begin_figure(xobjid, bbox, matrix)
            interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
            self.device.end_figure(xobjid)

            # for (k,v) in interpreter.text_lines.iteritems():
            #     self.text_sequences[k + self.keyword_count] = v
            self.keyword_count += interpreter.keyword_count
            print "Included %i keywords" % interpreter.keyword_count
        else:
            # ignored xobject type.
            pass
        return
示例#8
0
 def set_root(self, root):
   """Store the root object and resolve it into the catalog dictionary.

   Raises PDFSyntaxError when STRICT is set and the catalog's 'Type'
   is not the catalog literal.
   """
   self.root = root
   self.catalog = dict_value(self.root)
   # A wrong or missing type is tolerated unless strict mode is on.
   if STRICT and self.catalog.get('Type') is not LITERAL_CATALOG:
     raise PDFSyntaxError('Catalog not found!')
   return
示例#9
0
 def set_root(self, root):
     """Store the root object and resolve it into the catalog dictionary.

     Raises PDFSyntaxError when STRICT is set and the catalog's 'Type'
     is not the catalog literal.
     """
     self.root = root
     self.catalog = dict_value(self.root)
     is_catalog = self.catalog.get('Type') is LITERAL_CATALOG
     # A wrong or missing type is tolerated unless strict mode is on.
     if not is_catalog and STRICT:
         raise PDFSyntaxError('Catalog not found!')
     return
示例#10
0
 def __validateXObjectDictionary(self, dict, id):
     """Check one XObject dictionary and report what the validator flags.

     dict: the XObject dictionary.
     id: name of the XObject, used in the reported messages.

     Form XObjects are checked for Ref, OPI, Subtype2=PS, PS and a
     transparency Group; PostScript XObjects are reported outright; image
     XObjects are passed on to the image validator.
     """
     subtype = literal_name(dict.get("Subtype"))
     if subtype == "Form":
         formLabel = "Form XObject dictionary " + str(id)
         # 6.2.5
         if dict.get("Ref") is not None:
             # 6.2.6
             self.__write("XObject dictionary " + str(id) +
                          " is a reference XObject")
         if dict.get("OPI") is not None:
             self.__write(formLabel + " contains OPI entry")
         # TODO: NOTE reference 3 says nothing about Subtype2 and PS
         if dict.get("Subtype2") is not None:
             if literal_name(dict.get("Subtype2")) == "PS":
                 # Message previously read "containsSubtype2 ...".
                 self.__write(formLabel +
                              " contains Subtype2 entry with PS value")
         if dict.get("PS") is not None:
             self.__write(formLabel + " contains PS entry")
         if dict.get("Group") is not None:
             # 6.4
             groupDict = dict_value(dict.get("Group"))
             if literal_name(groupDict.get("S")) == "Transparency":
                 # Message previously lacked the space after the id and
                 # said "id" instead of "is".
                 self.__write(
                     formLabel +
                     " contains Group entry which S attribute value" +
                     " is /Transparency")
     elif subtype == "PS":
         # 6.2.7
         self.__write("Document contains PostScript XObject " + str(id))
     elif subtype == "Image":
         self.__validateImageDictionary(dict, str(id))
示例#11
0
    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError when the attributes have no 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        # A page without a Resources entry gets an empty dictionary
        # instead of failing with KeyError.
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # The crop box falls back to the media box.
            self.cropbox = self.mediabox
        # Normalise the rotation into the range 0..359.
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # Contents is always stored as a list, even for a single stream.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents
        self.number_of_pages = 0
        return
示例#12
0
 def do_Do(self, xobjid):
   """Invoke a named XObject.

   Form XObjects with a BBox are rendered by a duplicate interpreter;
   image XObjects with Width and Height are passed to the device; any
   other XObject is ignored.  An unknown id raises PDFInterpreterError
   when STRICT is set and is otherwise ignored.
   """
   xobjid = literal_name(xobjid)
   try:
     xobj = stream_value(self.xobjmap[xobjid])
   except KeyError:
     if STRICT:
       raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
     return
   if 1 <= self.debug:
     print >>stderr, 'Processing xobj: %r' % xobj
   attrs = xobj.dic
   subtype = attrs.get('Subtype')
   is_form = subtype is LITERAL_FORM and 'BBox' in attrs
   is_image = (not is_form and subtype is LITERAL_IMAGE
               and 'Width' in attrs and 'Height' in attrs)
   if is_form:
     child = self.dup()
     form_bbox = list_value(attrs['BBox'])
     form_matrix = list_value(attrs.get('Matrix', MATRIX_IDENTITY))
     self.device.begin_figure(xobjid, form_bbox, form_matrix)
     child.render_contents(dict_value(attrs.get('Resources')), [xobj],
                           ctm=mult_matrix(form_matrix, self.ctm))
     self.device.end_figure(xobjid)
   elif is_image:
     # Images are drawn into a unit-square figure.
     self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
     size = (attrs['Width'], attrs['Height'])
     self.device.render_image(xobj, size)
     self.device.end_figure(xobjid)
   # Any other xobject type is unsupported and silently skipped.
   return
示例#13
0
        def search(obj, parent):
            """Yield (objid, attrs) for every page at or below obj.

            obj: an object id (int) or an object with an objid attribute.
            parent: attribute dictionary of the parent node; its
                'Resources' entry is inherited when the node has none.
            """
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()

            # Inherit Resources from the parent.  The previous test
            # `k in 'Resources'` was a substring match on the key, not an
            # equality test.
            if 'Resources' in parent and 'Resources' not in tree:
                tree['Resources'] = parent['Resources']

            tree_type = tree.get('Type')
            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                # Intermediate node: descend into each kid.
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x

            elif tree_type is LITERAL_PAGE:
                yield (objid, tree)
示例#14
0
 def search(obj, parent):
     """Yield (objid, attrs) for every page at or below obj.

     obj: an object id (int) or an object with an objid attribute.
     parent: attribute dictionary of the parent node; entries listed in
         klass.INHERITABLE_ATTRS are copied down when the node lacks them.
     """
     if isinstance(obj, int):
         objid = obj
         resolved = document.getobj(objid)
     else:
         objid = obj.objid
         resolved = obj
     tree = dict_value(resolved).copy()
     for (name, value) in parent.iteritems():
         if name in klass.INHERITABLE_ATTRS and name not in tree:
             tree[name] = value
     node_type = tree.get('Type')
     if node_type is LITERAL_PAGES and 'Kids' in tree:
         # Intermediate node: descend into each kid.
         if 1 <= debug:
             print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
         for kid in list_value(tree['Kids']):
             for found in search(kid, tree):
                 yield found
     elif node_type is LITERAL_PAGE:
         if 1 <= debug:
             print >>sys.stderr, 'Page: %r' % tree
         yield (objid, tree)
示例#15
0
 def load_trailer(self, parser):
   """Read the trailer dictionary from parser and merge it into self.trailer.

   When the parser hits end of file, the last object left on its stack is
   used instead; PDFNoValidXRef is raised if there is none.
   """
   try:
     (_,keyword) = parser.nexttoken()
     assert keyword is self.KEYWORD_TRAILER
     (_,trailer_obj) = parser.nextobject()
   except PSEOF:
     leftover = parser.pop(1)
     if not leftover:
       raise PDFNoValidXRef('Unexpected EOF - file corrupted')
     (_,trailer_obj) = leftover[0]
   self.trailer.update(dict_value(trailer_obj))
   return
示例#16
0
 def lookup_name(self, cat, key):
   """Look up key in the name tree stored under category cat.

   Raises KeyError when the catalog has no usable 'Names' dictionary,
   when cat is missing, or when the key cannot be found.
   """
   try:
     names = dict_value(self.catalog['Names'])
   except (PDFTypeError, KeyError):
     raise KeyError((cat,key))
   # may raise KeyError
   d0 = dict_value(names[cat])
   def lookup(d):
     if 'Limits' in d:
       (k1,k2) = list_value(d['Limits'])
       # Key lies outside this node's range: let the caller try the
       # next kid.
       if key < k1 or k2 < key: return None
     # Names is checked independently of Limits: a root node that holds
     # the entries itself has no Limits entry, and was never searched
     # while this test was nested under the Limits check.
     if 'Names' in d:
       objs = list_value(d['Names'])
       names = dict(choplist(2, objs))
       return names[key]
     if 'Kids' in d:
       for c in list_value(d['Kids']):
         v = lookup(dict_value(c))
         if v: return v
     raise KeyError((cat,key))
   return lookup(d0)
示例#17
0
 def load_trailer(self, parser):
     """Read the trailer dictionary from parser and merge it into
     self.trailer.

     When the parser hits end of file, the last object left on its stack
     is used instead; PDFNoValidXRef is raised if there is none.
     """
     try:
         (_, keyword) = parser.nexttoken()
         assert keyword is self.KEYWORD_TRAILER
         (_, trailer_obj) = parser.nextobject()
     except PSEOF:
         leftover = parser.pop(1)
         if not leftover:
             raise PDFNoValidXRef('Unexpected EOF - file corrupted')
         (_, trailer_obj) = leftover[0]
     self.trailer.update(dict_value(trailer_obj))
     return
示例#18
0
 def lookup(d):
     """Search name-tree node d for `key` (taken from the enclosing scope).

     Returns the value found, or None when key is outside the node's
     Limits.  Raises KeyError when the node has neither a matching entry
     nor a kid that yields one.
     """
     if 'Limits' in d:
         (k1, k2) = list_value(d['Limits'])
         if key < k1 or k2 < key: return None
     # Names is checked independently of Limits: a root node that holds
     # the entries itself has no Limits entry, and was never searched
     # while this test was nested under the Limits check.
     if 'Names' in d:
         objs = list_value(d['Names'])
         names = dict(choplist(2, objs))
         return names[key]
     if 'Kids' in d:
         for c in list_value(d['Kids']):
             v = lookup(dict_value(c))
             if v: return v
     raise KeyError((cat, key))
示例#19
0
 def search(obj, parent):
     """Yield (objid, attrs, pageno) for every page at or below obj.

     obj: an object id (int) or an object with an objid attribute.
     parent: attribute dictionary of the parent node; entries listed in
         klass.INHERITABLE_ATTRS are copied down when the node lacks them.

     Each page found increments the global pageno and appends
     (objid, pageno) to the global result_pages.
     """
     global pageno
     global result_pages
     if isinstance(obj, int):
         objid = obj
         resolved = document.getobj(objid)
     else:
         objid = obj.objid
         resolved = obj
     tree = dict_value(resolved).copy()
     for (name, value) in parent.iteritems():
         if name in klass.INHERITABLE_ATTRS and name not in tree:
             tree[name] = value
     node_type = tree.get('Type')
     if node_type is LITERAL_PAGES and 'Kids' in tree:
         # Intermediate node: descend into each kid.
         if klass.debug:
             logging.info('Pages: Kids=%r' % tree['Kids'])
         for kid in list_value(tree['Kids']):
             for found in search(kid, tree):
                 yield found
     elif node_type is LITERAL_PAGE:
         if klass.debug:
             logging.info('Page: %r' % tree)
         pageno += 1
         result_pages.append((objid, pageno))
         yield (objid, tree, pageno)
示例#20
0
 def lookup(d):
   """Search name-tree node d for `key` (taken from the enclosing scope).

   Returns the value found, or None when key is outside the node's
   Limits.  Raises KeyError when the node has neither a matching entry
   nor a kid that yields one.
   """
   if 'Limits' in d:
     (k1,k2) = list_value(d['Limits'])
     if key < k1 or k2 < key: return None
   # Names is checked independently of Limits: a root node that holds
   # the entries itself has no Limits entry, and was never searched
   # while this test was nested under the Limits check.
   if 'Names' in d:
     objs = list_value(d['Names'])
     names = dict(choplist(2, objs))
     return names[key]
   if 'Kids' in d:
     for c in list_value(d['Kids']):
       v = lookup(dict_value(c))
       if v: return v
   raise KeyError((cat,key))
示例#21
0
    def lookup_name(self, cat, key):
        """Look up key in the name tree stored under category cat.

        Raises KeyError when the catalog has no usable 'Names' dictionary,
        when cat is missing, or when the key cannot be found.
        """
        try:
            names = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat, key))
        # may raise KeyError
        d0 = dict_value(names[cat])

        def lookup(d):
            if 'Limits' in d:
                (k1, k2) = list_value(d['Limits'])
                # Key lies outside this node's range: let the caller try
                # the next kid.
                if key < k1 or k2 < key: return None
            # Names is checked independently of Limits: a root node that
            # holds the entries itself has no Limits entry, and was never
            # searched while this test was nested under the Limits check.
            if 'Names' in d:
                objs = list_value(d['Names'])
                names = dict(choplist(2, objs))
                return names[key]
            if 'Kids' in d:
                for c in list_value(d['Kids']):
                    v = lookup(dict_value(c))
                    if v: return v
            raise KeyError((cat, key))

        return lookup(d0)
示例#22
0
class PDFCIDFont(PDFFont):
    def __init__(self, rsrc, spec):
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        self.cidcoding = '%s-%s' % (self.cidsysteminfo.get(
            'Registry',
            'unknown'), self.cidsysteminfo.get('Ordering', 'unknown'))
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        try:
            self.cmap = rsrc.get_cmap(name, strict=STRICT)
        except CMapDB.CMapNotFound, e:
            raise PDFFontError(e)
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
                                               strict=STRICT)
            except CMapDB.CMapNotFound, e:
                raise PDFFontError(e)
示例#23
0
 def search(obj, parent):
     """Yield (objid, attrs) for every page at or below obj.

     obj: a reference to a page-tree node.
     parent: attribute dictionary of the parent node; entries listed in
         self.INHERITABLE_ATTRS are copied down when the node lacks them.
     """
     tree = dict_value(obj).copy()
     for (name, value) in parent.iteritems():
         if name in self.INHERITABLE_ATTRS and name not in tree:
             tree[name] = value
     node_type = tree.get('Type')
     if node_type is LITERAL_PAGES and 'Kids' in tree:
         # Intermediate node: descend into each kid.
         if 1 <= self.debug:
             print >> stderr, 'Pages: Kids=%r' % tree['Kids']
         for kid in tree['Kids']:
             for found in search(kid, tree):
                 yield found
     elif node_type is LITERAL_PAGE:
         if 1 <= self.debug:
             print >> stderr, 'Page: %r' % tree
         yield (obj.objid, tree)
示例#24
0
 def search(obj, parent):
   """Yield (objid, attrs) for every page at or below obj.

   obj: a reference to a page-tree node.
   parent: attribute dictionary of the parent node; entries listed in
     self.INHERITABLE_ATTRS are copied down when the node lacks them.
   """
   tree = dict_value(obj).copy()
   for (name,value) in parent.iteritems():
     if name in self.INHERITABLE_ATTRS and name not in tree:
       tree[name] = value
   node_type = tree.get('Type')
   if node_type is LITERAL_PAGES and 'Kids' in tree:
     # Intermediate node: descend into each kid.
     if 1 <= self.debug:
       print >>stderr, 'Pages: Kids=%r' % tree['Kids']
     for kid in tree['Kids']:
       for found in search(kid, tree):
         yield found
   elif node_type is LITERAL_PAGE:
     if 1 <= self.debug:
       print >>stderr, 'Page: %r' % tree
     yield (obj.objid, tree)
示例#25
0
def init_process_pdf(fp, password=''):
    """Open a PDF and return (document, page count).

    fp: file object handed to the parser.
    password: document password; an empty string when none is set.

    The count is the 'Count' entry of the catalog's 'Pages' dictionary.
    Raises PDFTextExtractionNotAllowed when the document does not allow
    text extraction.
    """
    # Build the parser/document pair and wire them together.
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialization needs the password, even if it is empty.
    doc.initialize(password)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Extraction is not allowed: %r' % fp)
    pages_root = dict_value(doc.catalog.get("Pages"))
    page_count = num_value(pages_root.get("Count"))
    return (doc, page_count)
示例#26
0
 def search(entry, level):
     """Yield (level, title, dest, action, se) for an outline item, then
     for its children and its following siblings.

     Items are only yielded when they have a Title and an 'A' or 'Dest'
     entry; children are visited one level deeper.
     """
     node = dict_value(entry)
     if 'Title' in node and ('A' in node or 'Dest' in node):
         title = decode_text(str_value(node['Title']))
         yield (level, title, node.get('Dest'), node.get('A'),
                node.get('SE'))
     if 'First' in node and 'Last' in node:
         # Children of this item.
         for child in search(node['First'], level + 1):
             yield child
     if 'Next' in node:
         # Following sibling on the same level.
         for sibling in search(node['Next'], level):
             yield sibling
     return
示例#27
0
 def __init__(self, rsrc, spec):
   """Set up the font from its dictionary.

   rsrc: not used in this method.
   spec: the font dictionary.  Without a 'FontDescriptor' entry a
     minimal descriptor is built from 'Name' and 'FontBBox'.
   """
   first = int_value(spec.get('FirstChar', 0))
   # LastChar is read like FirstChar, but its value is not used below.
   lastchar = int_value(spec.get('LastChar', 0))
   # Map each character code, starting at FirstChar, to its width.
   width_list = list_value(spec.get('Widths', [0]*256))
   widths = {}
   for (offset,w) in enumerate(width_list):
     widths[offset+first] = w
   if 'FontDescriptor' not in spec:
     descriptor = {'FontName':spec.get('Name'),
                   'Ascent':0, 'Descent':0,
                   'FontBBox':spec['FontBBox']}
   else:
     descriptor = dict_value(spec['FontDescriptor'])
   PDFSimpleFont.__init__(self, descriptor, widths, spec)
   self.matrix = tuple(list_value(spec.get('FontMatrix')))
   # Descent and ascent are taken from the bounding box.
   (_,self.descent,_,self.ascent) = self.bbox
   (self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
   return
示例#28
0
 def search(entry, level):
   """Yield (level, title, dest, action, se) for an outline item, then
   for its children and its following siblings.

   Items are only yielded when they have a Title and an 'A' or 'Dest'
   entry; children are visited one level deeper.
   """
   node = dict_value(entry)
   if 'Title' in node and ('A' in node or 'Dest' in node):
     title = decode_text(str_value(node['Title']))
     yield (level, title, node.get('Dest'), node.get('A'),
            node.get('SE'))
   if 'First' in node and 'Last' in node:
     # Children of this item.
     for child in search(node['First'], level+1):
       yield child
   if 'Next' in node:
     # Following sibling on the same level.
     for sibling in search(node['Next'], level):
       yield sibling
   return
示例#29
0
 def get_font(self, objid, spec):
     """Return the font object for a font dictionary, using the cache.

     objid: object id used as the cache key; a false value disables
         caching for this call.
     spec: the font dictionary.

     With settings.STRICT set, a wrong 'Type', a missing 'Subtype' or an
     unknown subtype raises PDFFontError; otherwise a Type1 font is
     assumed.
     """
     if objid and objid in self._cached_fonts:
         return self._cached_fonts[objid]

     if settings.STRICT and spec['Type'] is not LITERAL_FONT:
         raise PDFFontError('Type is not /Font')
     # Create a Font object.
     if 'Subtype' in spec:
         subtype = literal_name(spec['Subtype'])
     elif settings.STRICT:
         raise PDFFontError('Font Subtype is not specified.')
     else:
         subtype = 'Type1'

     if subtype in ('Type1', 'MMType1'):
         # Type1 Font
         font = PDFType1Font(self, spec)
     elif subtype == 'TrueType':
         # TrueType Font
         font = PDFTrueTypeFont(self, spec)
     elif subtype == 'Type3':
         # Type3 Font
         font = PDFType3Font(self, spec)
     elif subtype in ('CIDFontType0', 'CIDFontType2'):
         # CID Font - Ensure recursive object references have been resolved
         sysinfo = spec['CIDSystemInfo']
         if type(sysinfo) is not PDFObjRef:
             for k in sysinfo:
                 if type(sysinfo[k]) is PDFObjRef:
                     sysinfo[k] = sysinfo[k].resolve()
         font = PDFCIDFont(self, spec)
     elif subtype == 'Type0':
         # Type0 Font: build the first descendant font, passing down the
         # parent's Encoding and ToUnicode entries.
         dfonts = list_value(spec['DescendantFonts'])
         assert dfonts
         subspec = dict_value(dfonts[0]).copy()
         for k in ('Encoding', 'ToUnicode'):
             if k in spec:
                 subspec[k] = resolve1(spec[k])
         font = self.get_font(None, subspec)
     else:
         if settings.STRICT:
             raise PDFFontError('Invalid Font spec: %r' % spec)
         font = PDFType1Font(self, spec)

     if objid and self.caching:
         self._cached_fonts[objid] = font
     return font
示例#30
0
 def __init__(self, rsrc, spec):
   """Set up the font from its dictionary.

   rsrc: not used in this method.
   spec: the font dictionary.

   Metrics come from FontMetricsDB when it knows the base font;
   otherwise the descriptor and widths are read from spec.  A missing
   'BaseFont' raises PDFFontError when STRICT is set.
   """
   try:
     self.basefont = literal_name(spec['BaseFont'])
   except KeyError:
     if STRICT:
       raise PDFFontError('BaseFont is missing')
     self.basefont = 'unknown'
   try:
     (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
   except KeyError:
     # No built-in metrics for this font: use the dictionary's own.
     descriptor = dict_value(spec.get('FontDescriptor', {}))
     first = int_value(spec.get('FirstChar', 0))
     # LastChar is read like FirstChar, but its value is not used below.
     lastchar = int_value(spec.get('LastChar', 255))
     width_list = list_value(spec.get('Widths', [0]*256))
     widths = {}
     for (offset,w) in enumerate(width_list):
       widths[offset+first] = w
   PDFSimpleFont.__init__(self, descriptor, widths, spec)
   return
示例#31
0
 def __init__(self, rsrc, spec):
     """Set up the font from its dictionary.

     rsrc: not used in this method.
     spec: the font dictionary.

     Metrics come from FontMetricsDB when it knows the base font;
     otherwise the descriptor and widths are read from spec.  A missing
     'BaseFont' raises PDFFontError when STRICT is set.
     """
     try:
         self.basefont = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         self.basefont = 'unknown'
     try:
         (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
     except KeyError:
         # No built-in metrics for this font: use the dictionary's own.
         descriptor = dict_value(spec.get('FontDescriptor', {}))
         first = int_value(spec.get('FirstChar', 0))
         # LastChar is read like FirstChar, but its value is not used
         # below.
         lastchar = int_value(spec.get('LastChar', 255))
         width_list = list_value(spec.get('Widths', [0] * 256))
         widths = {}
         for (offset, w) in enumerate(width_list):
             widths[offset + first] = w
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     return
示例#32
0
 def __init__(self, rsrc, spec):
     """Set up the font from its dictionary.

     rsrc: not used in this method.
     spec: the font dictionary.  Without a 'FontDescriptor' entry a
         minimal descriptor is built from 'Name' and 'FontBBox'.
     """
     first = int_value(spec.get('FirstChar', 0))
     # LastChar is read like FirstChar, but its value is not used below.
     lastchar = int_value(spec.get('LastChar', 0))
     # Map each character code, starting at FirstChar, to its width.
     width_list = list_value(spec.get('Widths', [0] * 256))
     widths = {}
     for (offset, w) in enumerate(width_list):
         widths[offset + first] = w
     if 'FontDescriptor' not in spec:
         descriptor = {
             'FontName': spec.get('Name'),
             'Ascent': 0,
             'Descent': 0,
             'FontBBox': spec['FontBBox']
         }
     else:
         descriptor = dict_value(spec['FontDescriptor'])
     PDFSimpleFont.__init__(self, descriptor, widths, spec)
     self.matrix = tuple(list_value(spec.get('FontMatrix')))
     # Descent and ascent are taken from the bounding box.
     (_, self.descent, _, self.ascent) = self.bbox
     (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
     return
示例#33
0
 def get_font(self, objid, spec):
   """Return the font object for a font dictionary, using the cache.

   objid: object id used as the cache key; a false value disables
     caching for this call.
   spec: the font dictionary.

   With STRICT set, a wrong 'Type', a missing 'Subtype' or an unknown
   subtype raises PDFFontError; otherwise a Type1 font is assumed.
   """
   if objid and objid in self.fonts:
     return self.fonts[objid]

   if STRICT and spec['Type'] is not LITERAL_FONT:
     raise PDFFontError('Type is not /Font')
   # Create a Font object.
   if 'Subtype' in spec:
     subtype = literal_name(spec['Subtype'])
   elif STRICT:
     raise PDFFontError('Font Subtype is not specified.')
   else:
     subtype = 'Type1'

   if subtype in ('Type1', 'MMType1'):
     # Type1 Font
     font = PDFType1Font(self, spec)
   elif subtype == 'TrueType':
     # TrueType Font
     font = PDFTrueTypeFont(self, spec)
   elif subtype == 'Type3':
     # Type3 Font
     font = PDFType3Font(self, spec)
   elif subtype in ('CIDFontType0', 'CIDFontType2'):
     # CID Font
     font = PDFCIDFont(self, spec)
   elif subtype == 'Type0':
     # Type0 Font: build the first descendant font, passing down the
     # parent's Encoding and ToUnicode entries.
     dfonts = list_value(spec['DescendantFonts'])
     assert dfonts
     subspec = dict_value(dfonts[0]).copy()
     for k in ('Encoding', 'ToUnicode'):
       if k in spec:
         subspec[k] = resolve1(spec[k])
     font = self.get_font(None, subspec)
   else:
     if STRICT:
       raise PDFFontError('Invalid Font spec: %r' % spec)
     font = PDFType1Font(self, spec) # this is so wrong!

   if objid:
     self.fonts[objid] = font
   return font
示例#34
0
 def __init__(self, rsrc, spec):
   try:
     self.basefont = literal_name(spec['BaseFont'])
   except KeyError:
     if STRICT:
       raise PDFFontError('BaseFont is missing')
     self.basefont = 'unknown'
   self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
   self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                               self.cidsysteminfo.get('Ordering', 'unknown'))
   try:
     name = literal_name(spec['Encoding'])
   except KeyError:
     if STRICT:
       raise PDFFontError('Encoding is unspecified')
     name = 'unknown'
   try:
     self.cmap = rsrc.get_cmap(name, strict=STRICT)
   except CMapDB.CMapNotFound, e:
     raise PDFFontError(e)
示例#35
0
 def __init__(self, rsrc, spec):
     try:
         self.basefont = literal_name(spec['BaseFont'])
     except KeyError:
         if STRICT:
             raise PDFFontError('BaseFont is missing')
         self.basefont = 'unknown'
     self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
     self.cidcoding = '%s-%s' % (self.cidsysteminfo.get(
         'Registry',
         'unknown'), self.cidsysteminfo.get('Ordering', 'unknown'))
     try:
         name = literal_name(spec['Encoding'])
     except KeyError:
         if STRICT:
             raise PDFFontError('Encoding is unspecified')
         name = 'unknown'
     try:
         self.cmap = rsrc.get_cmap(name, strict=STRICT)
     except CMapDB.CMapNotFound, e:
         raise PDFFontError(e)
示例#36
0
 def __init__(self, doc, pageid, attrs):
     """Initialize a page from its attribute dictionary.

     doc: the owning document object.
     pageid: identifier stored for the page.
     attrs: the page attributes (resolved through dict_value).

     Raises KeyError when the attributes have no 'MediaBox' entry.
     """
     self.doc = doc
     self.pageid = pageid
     self.attrs = dict_value(attrs)
     page = self.attrs
     self.lastmod = resolve1(page.get('LastModified'))
     self.resources = resolve1(page.get('Resources', dict()))
     self.mediabox = resolve1(page['MediaBox'])
     # The crop box falls back to the media box.
     if 'CropBox' not in page:
         self.cropbox = self.mediabox
     else:
         self.cropbox = resolve1(page['CropBox'])
     # Normalise the rotation into the range 0..359.
     self.rotate = (int_value(page.get('Rotate', 0)) + 360) % 360
     self.annots = page.get('Annots')
     self.beads = page.get('B')
     # Contents is always stored as a list, even for a single stream.
     if 'Contents' in page:
         contents = resolve1(page['Contents'])
         if not isinstance(contents, list):
             contents = [contents]
     else:
         contents = []
     self.contents = contents
     return
示例#37
0
 def __validateAnnotationDictionary(self, dict, id):
     """Check one annotation dictionary and report every violation found.

     dict -- the annotation dictionary (already resolved).
     id   -- identifier used in the reported messages.

     Findings are reported through self.__write; nothing is returned.
     Widget annotations are passed on to __validateWidgetAnnotation.
     For any other subtype the 'A' entry, if present, is passed to
     __validateAction.
     """
     # 6.5 (presumably the clause of the standard being validated)
     subtype = literal_name(dict.get("Subtype"))
     if subtype not in (
             "Text", "Link", "FreeText", "Line", "Square", "Circle",
             "Highlight", "Underline", "Squiggly", "StrikeOut", "Stamp",
             "Ink", "Popup", "Widget", "PrinterMark", "TrapNet"
     ):
         # The leading space was missing here, gluing the id to the text.
         self.__write("Annotation dictionary " + str(id) +
                      " contains invalid Subtype entry")
     opacity = dict.get("CA")
     if opacity is not None and num_value(opacity) != 1.0:
         self.__write("Annotation dictionary " + str(id) +
                      " contains CA entry which value isn't 1.0")
     # NOTE(review): any 'F' entry is reported, whatever its value;
     # confirm that this is the intended rule.
     if dict.get("F") is not None:
         self.__write("Annotation dictionary " + str(id) +
                      " contains F entry")
     if subtype == "Widget":
         self.__validateWidgetAnnotation(dict, id)
     else:
         action = dict.get("A")
         if action is not None:
             self.__validateAction(dict_value(action),
                                   "in annotation " + str(id))
示例#38
0
 def __init__(self, doc, pageid, attrs):
   """Populate the page object from its attribute dictionary.

   doc    -- the owning document, stored unchanged.
   pageid -- identifier of this page, stored unchanged.
   attrs  -- page attributes; passed through dict_value() first.

   'MediaBox' is looked up unconditionally, so a page dictionary
   without it propagates the lookup error.
   """
   self.doc = doc
   self.pageid = pageid
   self.attrs = dict_value(attrs)
   self.lastmod = resolve1(self.attrs.get('LastModified'))
   # A page without a 'Resources' entry used to raise KeyError here;
   # fall back to an empty dictionary instead.
   self.resources = resolve1(self.attrs.get('Resources', dict()))
   self.mediabox = resolve1(self.attrs['MediaBox'])
   if 'CropBox' in self.attrs:
     self.cropbox = resolve1(self.attrs['CropBox'])
   else:
     # Without an explicit crop box the media box is used in its place.
     self.cropbox = self.mediabox
   # Convert the rotation with int_value and fold it into 0..359, so
   # that callers always get a plain non-negative integer.
   self.rotate = (int_value(self.attrs.get('Rotate', 0)) + 360) % 360
   self.annots = self.attrs.get('Annots')
   self.beads = self.attrs.get('B')
   if 'Contents' in self.attrs:
     contents = resolve1(self.attrs['Contents'])
   else:
     contents = []
   # self.contents is always a list, even for a single content object.
   if not isinstance(contents, list):
     contents = [contents]
   self.contents = contents
   return
示例#39
0
 def __validateField(self, field, id):
     """Check a form field dictionary and, recursively, all of its kids.

     field -- the field dictionary (already resolved).
     id    -- identifier used in the reported message.

     An 'AA' entry is reported through self.__write. Each element of
     'Kids' is then validated the same way, identified by its objid.
     """
     # 6.6.1, 6.6.2, 6.9
     additional_actions = field.get("AA")
     if additional_actions is not None:
         self.__write("Field dictionary " + str(id) + " contains AA entry")
     kids = list_value(field.get("Kids"))
     for kid in kids:
         self.__validateField(dict_value(kid), kid.objid)
示例#40
0
 def __validatePattern(self, dict, id):
     """Validate the graphics state parameters of a type 2 pattern.

     dict -- the pattern dictionary (already resolved).
     id   -- identifier used to build the location text for messages.

     Patterns whose 'PatternType' is not 2 are ignored.
     """
     if num_value(dict.get("PatternType")) != 2:
         return
     ext_gstate = dict_value(dict.get("ExtGState"))
     location = "in " + str(id) + " pattern"
     self.__validateGraphicsStateParameterDictionary(ext_gstate, location)
示例#41
0
  def do_keyword(self, pos, token):
    """Process one keyword token found at file position pos.

    Depending on the keyword this
      * hands stack entries over to the results (xref, startxref, endobj),
      * replaces two stack entries with a PDFObjRef (R),
      * reads a stream body and pushes a PDFStream (stream), or
      * pushes the token itself unchanged (anything else).

    Raises PDFSyntaxError when STRICT is set and a stream dictionary has
    no 'Length' entry or the input ends while a stream is being read.
    """
    # xref / startxref: pass the single topmost stack entry to the results.
    if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
      self.add_results(*self.pop(1))
      return
    # endobj: pass the four topmost stack entries to the results
    # (presumably objid, genno, the 'obj' keyword and the object itself --
    # TODO confirm).
    if token is self.KEYWORD_ENDOBJ:
      self.add_results(*self.pop(4))
      return

    if token is self.KEYWORD_R:
      # Reference to an indirect object: the two preceding stack entries
      # are taken as object id and generation number.
      try:
        ((_,objid), (_,genno)) = self.pop(2)
        (objid, genno) = (int(objid), int(genno))
        obj = PDFObjRef(self.doc, objid, genno)
        self.push((pos, obj))
      except PSSyntaxError:
        # Best effort: on a syntax error nothing is pushed for this token.
        pass
      return

    if token is self.KEYWORD_STREAM:
      # Stream object: the preceding stack entry is the stream dictionary.
      ((_,dic),) = self.pop(1)
      dic = dict_value(dic)
      try:
        objlen = int_value(dic['Length'])
      except KeyError:
        if STRICT:
          raise PDFSyntaxError('/Length is undefined: %r' % dic)
        # Non-strict mode: start with zero bytes and rely on the
        # 'endstream' scan below to collect the data.
        objlen = 0
      # Go back to the keyword and consume the rest of its line, so that
      # pos ends up just past that line.
      self.seek(pos)
      try:
        (_, line) = self.nextline()  # the line holding 'stream'
      except PSEOF:
        if STRICT:
          raise PDFSyntaxError('Unexpected EOF')
        return
      pos += len(line)
      # Read the declared number of bytes straight from the file object.
      self.fp.seek(pos)
      data = self.fp.read(objlen)
      # Continue line by line after the declared length until 'endstream'
      # shows up; everything before it is appended to the data and added
      # to objlen (presumably to tolerate a 'Length' that is too small).
      self.seek(pos+objlen)
      while 1:
        try:
          (linepos, line) = self.nextline()  # linepos is not used
        except PSEOF:
          if STRICT:
            raise PDFSyntaxError('Unexpected EOF')
          # Non-strict mode: keep whatever has been collected so far.
          break
        if 'endstream' in line:
          i = line.index('endstream')
          objlen += i
          data += line[:i]
          break
        objlen += len(line)
        data += line
      # Resume parsing right after the collected data.
      self.seek(pos+objlen)
      if 1 <= self.debug:
        print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
              (pos, objlen, dic, data[:10])
      obj = PDFStream(dic, data, self.doc.decipher)
      self.push((pos, obj))
      return

    # Any other keyword is pushed onto the stack unchanged.
    self.push((pos, token))
    return
示例#42
0
    def __initializePTree(self, doc):
        """Fill the tree with one node per page, font and image.

        doc -- document object; only its get_pages() is used.

        The root node is labelled "Document". Every page becomes a child
        labelled "Page <n>" (n counts from 1) whose data is the page id;
        the page number is also recorded in self.__pagenos. Below each
        page there is one node per entry of the page's "Font" resources
        and one node per "XObject" entry whose Subtype is "Image".

        An image node carries the tuple (stream, page number, objid, 0).
        For a node whose stream has ImageMask set, the objid is replaced
        by the objid of the image that names it in its "Mask" entry, when
        there is such an image on the same page.
        """
        self.__ptree.label = "Document"
        for (index, p) in enumerate(doc.get_pages()):
            pageno = index + 1  # pages are numbered from 1
            child = PTree()
            child.label = "Page " + str(pageno)
            self.__pagenos.setdefault(pageno, p.pageid)
            child.data = p.pageid
            self.__ptree.children.append(child)
            child.parent = self.__ptree
            fonts = dict_value(p.resources.get("Font"))
            images = dict_value(p.resources.get("XObject"))

            for (fontid, spec) in fonts.iteritems():
                # The object id of the font is not needed, so a font given
                # as a direct dictionary (no objid attribute) works too.
                spec = dict_value(spec)
                child2 = PTree()
                child2.label = "Font " + str(fontid)
                child2.data = Font.new(spec,
                                       None,
                                       p.pageid,
                                       child2,
                                       gui=self.__gui,
                                       map=self.__map)
                assert child2.data.name is not None
                child.children.append(child2)
                child2.parent = child

            # objid of a stream used as "Mask" -> objid of the image using it
            maskMap = {}
            # image nodes whose own stream has ImageMask set
            masks = []
            for (objname, ref) in images.iteritems():
                # TODO: is this always going to be a reference?
                objid = ref.objid
                spec = stream_value(ref)
                flag = spec.get("ImageMask")
                isMask = flag is not None and num_value(flag) == 1
                maskRef = spec.get("Mask")
                if maskRef is not None and stream_value2(maskRef) is not None:
                    # TODO: NOTE pdfminer does not handle genno
                    maskMap.setdefault(maskRef.objid, spec.objid)
                if literal_name(spec.get("Subtype")) == "Image":
                    child2 = PTree()
                    child2.label = "Image " + str(objname)
                    # The trailing 0 presumably stands in for the
                    # generation number, which pdfminer does not support.
                    child2.data = (spec, pageno, objid, 0)
                    child.children.append(child2)
                    child2.parent = child
                    if isMask:
                        masks.append(child2)

            # Point every mask node at the image that uses it, if any.
            for mask in masks:
                (a, b, c, d) = mask.data
                owner = maskMap.get(c)
                if owner is not None:
                    mask.data = (a, b, owner, d)
示例#43
0
 def getValue(self, props, key):
     """Return the value stored under key in a property list.

     props -- either a dictionary given inline in the content, or a
              name that selects an entry of the "Properties" dictionary
              in self.resources.
     key   -- the key to look up.
     """
     try:
         # Inline case: props is itself a dictionary from the content.
         return props.get(key)
     except AttributeError:
         # Named case: props has no get(), so look it up by name in the
         # resources.
         properties = self.resources.get("Properties")
         entry = properties.get(literal_name(props))
         return dict_value(entry).get(key)