예제 #1
0
    def __init__(self, doc, pageid, attrs):
        """Build a page from its owning document, identifier and attributes.

        doc: the PDFDocument object the page belongs to.
        pageid: any Python object that uniquely identifies the page.
        attrs: the dictionary of page attributes.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        page_attrs = self.attrs
        self.lastmod = resolve1(page_attrs.get("LastModified"))
        self.resources = resolve1(page_attrs["Resources"])
        self.mediabox = resolve1(page_attrs["MediaBox"])
        # A page without its own CropBox is cropped to its MediaBox.
        self.cropbox = (
            resolve1(page_attrs["CropBox"])
            if "CropBox" in page_attrs
            else self.mediabox
        )
        # Shift by a full turn before the modulo to normalize the angle.
        self.rotate = (int_value(page_attrs.get("Rotate", 0)) + 360) % 360
        self.annots = page_attrs.get("Annots")
        self.beads = page_attrs.get("B")
        streams = resolve1(page_attrs["Contents"]) if "Contents" in page_attrs else []
        # A lone content stream is wrapped so self.contents is always a list.
        self.contents = streams if isinstance(streams, list) else [streams]
        return
예제 #2
0
파일: pdfparser.py 프로젝트: xarg/pdfminer
 def __init__(self, doc, pageid, attrs):
     """Initialize a page object.

     doc: a PDFDocument object.
     pageid: any Python object that can uniquely identify the page.
     attrs: a dictionary of page attributes.

     Resources and MediaBox are required entries of attrs (a missing
     one raises KeyError); CropBox falls back to MediaBox and Contents
     is always stored as a list.
     """
     self.doc = doc
     self.pageid = pageid
     self.attrs = dict_value(attrs)
     self.lastmod = resolve1(self.attrs.get('LastModified'))
     self.resources = resolve1(self.attrs['Resources'])
     self.mediabox = resolve1(self.attrs['MediaBox'])
     if 'CropBox' in self.attrs:
         self.cropbox = resolve1(self.attrs['CropBox'])
     else:
         self.cropbox = self.mediabox
     # Rotate can be stored as an indirect reference like the other page
     # attributes, so resolve it before doing arithmetic on it.
     self.rotate = (resolve1(self.attrs.get('Rotate', 0))+360) % 360
     self.annots = self.attrs.get('Annots')
     self.beads = self.attrs.get('B')
     if 'Contents' in self.attrs:
         contents = resolve1(self.attrs['Contents'])
     else:
         contents = []
     # A single content stream is wrapped so callers can always iterate.
     if not isinstance(contents, list):
         contents = [ contents ]
     self.contents = contents
     return
예제 #3
0
 def init_resources(self, resources):
     """Build the font, XObject and color-space tables from a resource dictionary.

     resources: the resource dictionary to scan; when it is falsy only
         the predefined color spaces are installed and the font and
         XObject tables stay empty.
     """
     self.fontmap = {}
     self.xobjmap = {}
     self.csmap = PREDEFINED_COLORSPACE.copy()
     if not resources: return
     def get_colorspace(spec):
         # A color space is either a bare name or a list whose first
         # element is the family name.
         if isinstance(spec, list):
             name = literal_name(spec[0])
         else:
             name = literal_name(spec)
         if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
             return PDFColorSpace(name, stream_value(spec[1])['N'])
         elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
             return PDFColorSpace(name, len(list_value(spec[1])))
         else:
             # A family missing from the predefined table yields None
             # instead of aborting the whole scan with KeyError.
             return PREDEFINED_COLORSPACE.get(name)
     for (k,v) in dict_value(resources).iteritems():
         if 1 <= self.debug:
             print >>stderr, 'Resource: %r: %r' % (k,v)
         if k == 'Font':
             for (fontid,spec) in dict_value(v).iteritems():
                 # Only fonts given as indirect references carry an
                 # object id that get_font can use.
                 objid = None
                 if isinstance(spec, PDFObjRef):
                     objid = spec.objid
                 spec = dict_value(spec)
                 self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
         elif k == 'ColorSpace':
             for (csid,spec) in dict_value(v).iteritems():
                 self.csmap[csid] = get_colorspace(resolve1(spec))
         elif k == 'ProcSet':
             self.rsrc.get_procset(list_value(v))
         elif k == 'XObject':
             # XObjects are stored as found, without being resolved here.
             for (xobjid,xobjstrm) in dict_value(v).iteritems():
                 self.xobjmap[xobjid] = xobjstrm
     return
예제 #4
0
 def init_resources(self, resources):
     """Prepare the fonts, color spaces and XObjects named by resources.

     resources: a resource dictionary; a falsy value leaves the font and
         XObject maps empty and the color-space map at its predefined
         contents.
     """
     self.fontmap = {}
     self.xobjmap = {}
     self.csmap = PREDEFINED_COLORSPACE.copy()
     if not resources: return
     def get_colorspace(spec):
         # spec is a bare family name or a list starting with one.
         if isinstance(spec, list):
             name = literal_name(spec[0])
         else:
             name = literal_name(spec)
         if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
             # Component count comes from the N entry of the profile stream.
             return PDFColorSpace(name, stream_value(spec[1])['N'])
         elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
             # Component count is the length of the names list.
             return PDFColorSpace(name, len(list_value(spec[1])))
         else:
             # Use .get so an unlisted family maps to None rather than
             # raising KeyError and stopping resource initialization.
             return PREDEFINED_COLORSPACE.get(name)
     for (k,v) in dict_value(resources).iteritems():
         if 1 <= self.debug:
             print >>stderr, 'Resource: %r: %r' % (k,v)
         if k == 'Font':
             for (fontid,spec) in dict_value(v).iteritems():
                 objid = None
                 if isinstance(spec, PDFObjRef):
                     objid = spec.objid
                 spec = dict_value(spec)
                 self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
         elif k == 'ColorSpace':
             for (csid,spec) in dict_value(v).iteritems():
                 self.csmap[csid] = get_colorspace(resolve1(spec))
         elif k == 'ProcSet':
             self.rsrc.get_procset(list_value(v))
         elif k == 'XObject':
             for (xobjid,xobjstrm) in dict_value(v).iteritems():
                 self.xobjmap[xobjid] = xobjstrm
     return
예제 #5
0
파일: pdfpage.py 프로젝트: zzma/pdfminer
 def create_widget(obj):
     """Return an LTWidget for a widget annotation, or None.

     obj: an annotation dictionary. None is returned when its Subtype is
     not the widget literal, or when it has neither a usable field type
     of its own nor a Parent entry to take one from.
     """
     if 'Subtype' not in obj or obj.get('Subtype') is not LITERAL_WIDGET:
         return None
     kind = get_widget_type(obj)
     if kind:
         bounds = list_value(obj.get('Rect'))
         title = str_value(obj.get('T'))
         return LTWidget(bounds, kind, title)
     if 'Parent' not in obj:
         return None
     # Fall back to the parent for both the field type and the title.
     parent = resolve1(obj.get('Parent'))
     bounds = list_value(obj.get('Rect'))
     parent_kind = get_widget_type(parent)
     title = str_value(parent.get('T'))
     return LTWidget(bounds, parent_kind, title)
예제 #6
0
파일: pdfpage.py 프로젝트: zzma/pdfminer
        def find_widgets(obj_list, widgets):
            """Append the result of create_widget for each resolvable annotation.

            obj_list: iterable of annotation objects or references.
            widgets: list extended in place; it receives whatever
                create_widget returns for each resolved annotation.

            An annotation whose object cannot be found is reported on
            standard output and skipped.
            """
            for an in obj_list:
                try:
                    obj = resolve1(an)
                except PDFObjectNotFound:
                    # Parenthesized form prints the same text as the
                    # original print statements.
                    print('object not found')
                    print(an)
                    # Without this, obj would be unbound on the first
                    # item or still hold the previous annotation.
                    continue

                widgets.append(create_widget(obj))
예제 #7
0
파일: pdfparser.py 프로젝트: Big-Data/pypes
 def __init__(self, doc, pageid, attrs):
   """Initialize a page object.

   doc: the document object that owns the page.
   pageid: an object identifying the page.
   attrs: a dictionary of page attributes; Resources and MediaBox must
     be present (a missing one raises KeyError).
   """
   self.doc = doc
   self.pageid = pageid
   self.attrs = dict_value(attrs)
   self.lastmod = resolve1(self.attrs.get('LastModified'))
   self.resources = resolve1(self.attrs['Resources'])
   self.mediabox = resolve1(self.attrs['MediaBox'])
   if 'CropBox' in self.attrs:
     self.cropbox = resolve1(self.attrs['CropBox'])
   else:
     self.cropbox = self.mediabox
   # Resolve Rotate like the other attributes so callers get the value
   # itself, never an indirect reference.
   self.rotate = resolve1(self.attrs.get('Rotate', 0))
   self.annots = self.attrs.get('Annots')
   self.beads = self.attrs.get('B')
   if 'Contents' in self.attrs:
     contents = resolve1(self.attrs['Contents'])
   else:
     contents = []
   # A single content stream is wrapped so contents is always a list.
   if not isinstance(contents, list):
     contents = [ contents ]
   self.contents = contents
   return
예제 #8
0
파일: pdffont.py 프로젝트: mcs07/pdfminer
 def __init__(self, descriptor, widths, default_width=None):
     """Set up the font's name and metrics from its descriptor.

     descriptor: mapping of font descriptor entries, read with .get().
     widths: width table, stored unchanged.
     default_width: used when truthy; otherwise the descriptor's
         MissingWidth entry (0 when absent) is used.
     """
     self.descriptor = descriptor
     self.widths = widths
     name = resolve1(descriptor.get('FontName', 'unknown'))
     # Literal names are reduced to their plain name.
     self.fontname = literal_name(name) if isinstance(name, PSLiteral) else name
     self.flags = int_value(descriptor.get('Flags', 0))
     self.ascent = num_value(descriptor.get('Ascent', 0))
     self.descent = num_value(descriptor.get('Descent', 0))
     self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
     if default_width:
         self.default_width = default_width
     else:
         self.default_width = num_value(descriptor.get('MissingWidth', 0))
     self.leading = num_value(descriptor.get('Leading', 0))
     self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
     self.hscale = .001
     self.vscale = .001
예제 #9
0
 def __init__(self, descriptor, widths, default_width=None):
     """Populate the font's attributes from its descriptor dictionary.

     descriptor: mapping of font descriptor entries, read with .get().
     widths: width table, kept as given.
     default_width: preferred default width; a falsy value selects the
         descriptor's MissingWidth entry instead (0 when absent).
     """
     self.descriptor = descriptor
     self.widths = widths
     raw_name = resolve1(descriptor.get("FontName", "unknown"))
     if isinstance(raw_name, PSLiteral):
         raw_name = literal_name(raw_name)
     self.fontname = raw_name
     self.flags = int_value(descriptor.get("Flags", 0))
     # These three metrics are read the same way: numeric, defaulting to 0.
     for (attr, key) in (
         ("ascent", "Ascent"),
         ("descent", "Descent"),
         ("italic_angle", "ItalicAngle"),
     ):
         setattr(self, attr, num_value(descriptor.get(key, 0)))
     if default_width:
         self.default_width = default_width
     else:
         self.default_width = num_value(descriptor.get("MissingWidth", 0))
     self.leading = num_value(descriptor.get("Leading", 0))
     self.bbox = list_value(descriptor.get("FontBBox", (0, 0, 0, 0)))
     self.hscale = 0.001
     self.vscale = 0.001
     return
예제 #10
0
 def __init__(self, descriptor, widths, default_width=None):
     """Record the descriptor-derived name and metrics of the font.

     descriptor: mapping of font descriptor entries.
     widths: width table, stored as given.
     default_width: default width; when falsy the descriptor's
         MissingWidth entry is used (0 when that is absent too).
     """
     entry = descriptor.get
     self.descriptor = descriptor
     self.widths = widths
     fontname = resolve1(entry('FontName', 'unknown'))
     if isinstance(fontname, PSLiteral):
         self.fontname = literal_name(fontname)
     else:
         self.fontname = fontname
     self.flags = int_value(entry('Flags', 0))
     self.ascent = num_value(entry('Ascent', 0))
     self.descent = num_value(entry('Descent', 0))
     self.italic_angle = num_value(entry('ItalicAngle', 0))
     self.default_width = (
         default_width if default_width else num_value(entry('MissingWidth', 0))
     )
     self.leading = num_value(entry('Leading', 0))
     self.bbox = list_value(entry('FontBBox', (0,0,0,0)))
     self.hscale = .001
     self.vscale = .001
     return
예제 #11
0
 def get_font(self, objid, spec):
     """Return the font object for spec, reusing a cached one if present.

     objid: cache key; a falsy value bypasses both lookup and storage.
     spec: the font dictionary.

     Raises PDFFontError when STRICT is set and the dictionary has the
     wrong Type, no Subtype, or an unrecognized Subtype.
     """
     if objid and objid in self._cached_fonts:
         return self._cached_fonts[objid]
     if 2 <= self.debug:
         print >> sys.stderr, 'get_font: create: objid=%r, spec=%r' % (
             objid, spec)
     if STRICT and spec['Type'] is not LITERAL_FONT:
         raise PDFFontError('Type is not /Font')
     # Decide which font class the Subtype entry selects.
     if 'Subtype' in spec:
         subtype = literal_name(spec['Subtype'])
     elif STRICT:
         raise PDFFontError('Font Subtype is not specified.')
     else:
         subtype = 'Type1'
     if subtype == 'Type1' or subtype == 'MMType1':
         font = PDFType1Font(self, spec)
     elif subtype == 'TrueType':
         font = PDFTrueTypeFont(self, spec)
     elif subtype == 'Type3':
         font = PDFType3Font(self, spec)
     elif subtype == 'CIDFontType0' or subtype == 'CIDFontType2':
         font = PDFCIDFont(self, spec)
     elif subtype == 'Type0':
         # Build from the first descendant font, carrying over the
         # parent's Encoding and ToUnicode entries.
         descendants = list_value(spec['DescendantFonts'])
         assert descendants
         subspec = dict_value(descendants[0]).copy()
         for key in ('Encoding', 'ToUnicode'):
             if key in spec:
                 subspec[key] = resolve1(spec[key])
         font = self.get_font(None, subspec)
     elif STRICT:
         raise PDFFontError('Invalid Font spec: %r' % spec)
     else:
         font = PDFType1Font(self, spec)  # this is so wrong!
     if objid and self.caching:
         self._cached_fonts[objid] = font
     return font
 def get_font(self, objid, spec):
     """Look up or create the font described by spec.

     objid: key into the font cache; when falsy the cache is neither
         consulted nor updated.
     spec: the font dictionary.

     With STRICT set, PDFFontError is raised for a wrong Type, a missing
     Subtype or an unknown Subtype; otherwise a Type1 font is assumed.
     """
     if objid and objid in self._cached_fonts:
         return self._cached_fonts[objid]
     if 2 <= self.debug:
         print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
     if STRICT and spec['Type'] is not LITERAL_FONT:
         raise PDFFontError('Type is not /Font')
     if 'Subtype' in spec:
         kind = literal_name(spec['Subtype'])
     elif STRICT:
         raise PDFFontError('Font Subtype is not specified.')
     else:
         kind = 'Type1'
     if kind == 'Type1' or kind == 'MMType1':
         created = PDFType1Font(self, spec)
     elif kind == 'TrueType':
         created = PDFTrueTypeFont(self, spec)
     elif kind == 'Type3':
         created = PDFType3Font(self, spec)
     elif kind == 'CIDFontType0' or kind == 'CIDFontType2':
         created = PDFCIDFont(self, spec)
     elif kind == 'Type0':
         # Delegate to the first descendant font, passing along the
         # parent's Encoding and ToUnicode entries.
         dfonts = list_value(spec['DescendantFonts'])
         assert dfonts
         child_spec = dict_value(dfonts[0]).copy()
         for name in ('Encoding', 'ToUnicode'):
             if name in spec:
                 child_spec[name] = resolve1(spec[name])
         created = self.get_font(None, child_spec)
     elif STRICT:
         raise PDFFontError('Invalid Font spec: %r' % spec)
     else:
         created = PDFType1Font(self, spec) # this is so wrong!
     if objid and self.caching:
         self._cached_fonts[objid] = created
     return created
예제 #13
0
 def get_font(self, objid, spec):
     """Fetch the cached font for objid or construct one from spec.

     objid: cache key; falsy values skip the cache entirely.
     spec: the font dictionary.

     Raises PDFFontError under STRICT for a wrong Type, an absent
     Subtype or an unsupported Subtype.
     """
     if objid and objid in self._cached_fonts:
         return self._cached_fonts[objid]
     if 2 <= self.debug:
         print >>sys.stderr, "get_font: create: objid=%r, spec=%r" % (objid, spec)
     if STRICT and spec["Type"] is not LITERAL_FONT:
         raise PDFFontError("Type is not /Font")
     # A missing Subtype is treated as Type1 outside STRICT mode.
     if "Subtype" in spec:
         subtype = literal_name(spec["Subtype"])
     elif STRICT:
         raise PDFFontError("Font Subtype is not specified.")
     else:
         subtype = "Type1"
     if subtype == "Type1" or subtype == "MMType1":
         result = PDFType1Font(self, spec)
     elif subtype == "TrueType":
         result = PDFTrueTypeFont(self, spec)
     elif subtype == "Type3":
         result = PDFType3Font(self, spec)
     elif subtype == "CIDFontType0" or subtype == "CIDFontType2":
         result = PDFCIDFont(self, spec)
     elif subtype == "Type0":
         # Use the first descendant font, overriding its Encoding and
         # ToUnicode with the parent's when the parent defines them.
         descendant_fonts = list_value(spec["DescendantFonts"])
         assert descendant_fonts
         subspec = dict_value(descendant_fonts[0]).copy()
         for entry in ("Encoding", "ToUnicode"):
             if entry in spec:
                 subspec[entry] = resolve1(spec[entry])
         result = self.get_font(None, subspec)
     elif STRICT:
         raise PDFFontError("Invalid Font spec: %r" % spec)
     else:
         result = PDFType1Font(self, spec)  # this is so wrong!
     if objid and self.caching:
         self._cached_fonts[objid] = result
     return result
예제 #14
0
파일: pdffont.py 프로젝트: mcs07/pdfminer
 def __init__(self, descriptor, widths, spec):
     """Set up the character-code mappings, then the base font.

     descriptor, widths: forwarded unchanged to PDFFont.__init__.
     spec: the font dictionary; its Encoding and ToUnicode entries are
         read here when present.
     """
     # The encoding is either the name of a built-in encoding or a
     # dictionary describing differences from a base encoding.
     encoding = (
         resolve1(spec['Encoding']) if 'Encoding' in spec
         else LITERAL_STANDARD_ENCODING
     )
     if not isinstance(encoding, dict):
         self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
     else:
         base = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
         differences = list_value(encoding.get('Differences', None))
         self.cid2unicode = EncodingDB.get_encoding(base, differences)
     self.unicode_map = None
     if 'ToUnicode' in spec:
         tounicode = stream_value(spec['ToUnicode'])
         self.unicode_map = FileUnicodeMap()
         # Parse the ToUnicode stream data into the freshly created map.
         CMapParser(self.unicode_map, StringIO(tounicode.get_data())).run()
     PDFFont.__init__(self, descriptor, widths)
예제 #15
0
 def __init__(self, descriptor, widths, spec):
     """Derive cid2unicode and unicode_map from spec, then init PDFFont.

     descriptor, widths: passed straight through to PDFFont.__init__.
     spec: the font dictionary supplying optional Encoding and
         ToUnicode entries.
     """
     # Encoding is given either as a built-in encoding name or as a
     # dictionary of differences; default to the standard encoding.
     encoding = LITERAL_STANDARD_ENCODING
     if 'Encoding' in spec:
         encoding = resolve1(spec['Encoding'])
     if isinstance(encoding, dict):
         base_name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
         diffs = list_value(encoding.get('Differences', None))
         mapping = EncodingDB.get_encoding(base_name, diffs)
     else:
         mapping = EncodingDB.get_encoding(literal_name(encoding))
     self.cid2unicode = mapping
     self.unicode_map = None
     if 'ToUnicode' in spec:
         cmap_stream = stream_value(spec['ToUnicode'])
         self.unicode_map = FileUnicodeMap()
         parser = CMapParser(self.unicode_map, StringIO(cmap_stream.get_data()))
         parser.run()
     PDFFont.__init__(self, descriptor, widths)
     return
예제 #16
0
    def init_resources(self, resources):
        """Fill the font, XObject and color-space maps from resources.

        resources: a resource dictionary, stored on self.resources; a
            falsy value leaves the maps at their initial contents.
        """
        self.resources = resources
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec):
            # spec is a bare family name or a list starting with one.
            is_list = isinstance(spec, list)
            name = literal_name(spec[0] if is_list else spec)
            has_param = is_list and 2 <= len(spec)
            if name == "ICCBased" and has_param:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            if name == "DeviceN" and has_param:
                return PDFColorSpace(name, len(list_value(spec[1])))
            # Families outside the predefined table yield None.
            return PREDEFINED_COLORSPACE.get(name)

        for (category, entries) in dict_value(resources).iteritems():
            if 2 <= self.debug:
                print >>sys.stderr, "Resource: %r: %r" % (category, entries)
            if category == "Font":
                for (fontid, spec) in dict_value(entries).iteritems():
                    # Only indirect references carry an object id.
                    objid = spec.objid if isinstance(spec, PDFObjRef) else None
                    fontspec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, fontspec)
            elif category == "ColorSpace":
                for (csid, spec) in dict_value(entries).iteritems():
                    self.csmap[csid] = get_colorspace(resolve1(spec))
            elif category == "ProcSet":
                self.rsrcmgr.get_procset(list_value(entries))
            elif category == "XObject":
                for (xobjid, xobjstrm) in dict_value(entries).iteritems():
                    self.xobjmap[xobjid] = xobjstrm
        return
예제 #17
0
파일: pdfinterp.py 프로젝트: mcs07/pdfminer
    def init_resources(self, resources):
        """Prepare the fonts, color spaces and XObjects listed in resources.

        resources: a resource dictionary, kept on self.resources; when
            it is falsy only the predefined color spaces are installed.
        """
        self.resources = resources
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec):
            # A color space is a family name, optionally wrapped in a
            # list together with its parameters.
            listed = isinstance(spec, list)
            if listed:
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            with_param = listed and 2 <= len(spec)
            if name == 'ICCBased' and with_param:
                return PDFColorSpace(name, stream_value(spec[1])['N'])
            if name == 'DeviceN' and with_param:
                return PDFColorSpace(name, len(list_value(spec[1])))
            return PREDEFINED_COLORSPACE.get(name)

        for (section, value) in dict_value(resources).iteritems():
            log.debug('Resource: %r: %r', section, value)
            if section == 'Font':
                for (fontid, spec) in dict_value(value).iteritems():
                    # The object id is only known for indirect references.
                    objid = spec.objid if isinstance(spec, PDFObjRef) else None
                    font = self.rsrcmgr.get_font(objid, dict_value(spec))
                    self.fontmap[fontid] = font
            elif section == 'ColorSpace':
                for (csid, spec) in dict_value(value).iteritems():
                    self.csmap[csid] = get_colorspace(resolve1(spec))
            elif section == 'ProcSet':
                self.rsrcmgr.get_procset(list_value(value))
            elif section == 'XObject':
                for (xobjid, xobjstrm) in dict_value(value).iteritems():
                    self.xobjmap[xobjid] = xobjstrm
예제 #18
0
파일: pdfpage.py 프로젝트: zzma/pdfminer
    def __init__(self, doc, pageid, attrs):
        """Initialize a page object and collect its form widgets.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Each annotation that can be resolved contributes one entry to
        self.widgets: an LTWidget for widget annotations, None otherwise.
        Annotations whose object cannot be found are reported on standard
        output and skipped.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        # Default to an empty list so a page without Annots never hands
        # None to list_value.
        self.annots = list_value(self.attrs.get('Annots', []))
        self.widgets = []

        def get_widget_type(obj):
            # Map the FT entry to a widget kind; None when the entry is
            # absent or not one of the recognized literals.
            if 'FT' not in obj:
                return None
            field_type = obj.get('FT')
            if field_type is LITERAL_TX:
                return 'text'
            elif field_type is LITERAL_CH or field_type is LITERAL_BTN:
                return 'checkbox'
            elif field_type is LITERAL_SIG:
                return 'signature'
            return None

        def create_widget(obj):
            # Only widget annotations produce an LTWidget; everything
            # else yields None.
            if 'Subtype' in obj and obj.get('Subtype') is LITERAL_WIDGET:
                wtype = get_widget_type(obj)
                if wtype:
                    return LTWidget(list_value(obj.get('Rect')), wtype, str_value(obj.get('T')))
                elif 'Parent' in obj:
                    # Take the field type and title from the parent.
                    p = resolve1(obj.get('Parent'))
                    return LTWidget(list_value(obj.get('Rect')), get_widget_type(p), str_value(p.get('T')))
            return None

        def find_widgets(obj_list, widgets):
            for an in obj_list:
                try:
                    obj = resolve1(an)
                except PDFObjectNotFound:
                    # Parenthesized form prints the same text as the
                    # original print statements.
                    print('object not found')
                    print(an)
                    # Without this, obj would be unbound on the first
                    # item or still hold the previous annotation.
                    continue

                widgets.append(create_widget(obj))

        find_widgets(self.annots, self.widgets)

        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # A single content stream is wrapped so contents is always a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents
        return