def __hasMask(spec):
    """Report whether the image `spec` references a separate mask stream.

    When the "Mask" entry of the image resolves to a stream, the pair
    (mask objid -> image objid) is also recorded in `maskMap` (a mapping
    defined in the enclosing scope).

    Returns True when a stream mask was found and registered, False when
    there is no "Mask" entry or it does not resolve to a stream.
    """
    mask = stream_value(spec).get("Mask")
    if mask is None:
        return False
    if stream_value2(mask) is None:
        # "Mask" is present but is not a stream.
        return False
    # TODO: NOTE pdfminer does not handle genno
    maskMap.setdefault(mask.objid, spec.objid)
    # The original fell off the end here and returned None (falsy) even
    # though a mask had just been found.
    return True
class PDFCIDFont(PDFFont):
    """CID-keyed font built from a font dictionary (`spec`)."""

    def __init__(self, rsrc, spec):
        """Read base font, CID system info, CMap and Unicode mapping.

        rsrc: object providing get_cmap(name, strict=...).
        spec: the font dictionary.

        Raises PDFFontError when a required entry is missing and STRICT is
        set, or when a needed CMap cannot be found.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'

        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        registry = self.cidsysteminfo.get('Registry', 'unknown')
        ordering = self.cidsysteminfo.get('Ordering', 'unknown')
        self.cidcoding = '%s-%s' % (registry, ordering)

        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'

        try:
            self.cmap = rsrc.get_cmap(name, strict=STRICT)
        except CMapDB.CMapNotFound as e:
            raise PDFFontError(e)

        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}

        # An embedded TrueType program, when present, can supply a
        # Unicode mapping for Adobe-Identity fonts below.
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            fontdata = StringIO(self.fontfile.get_data())
            ttf = TrueTypeFont(self.basefont, fontdata)

        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            tounicode = self.ucs2_cmap = CMap()
            CMapParser(tounicode, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    # No usable cmap table; leave ucs2_cmap as None.
                    pass
        else:
            ucs2_name = '%s-UCS2' % self.cidcoding
            try:
                self.ucs2_cmap = rsrc.get_cmap(ucs2_name, strict=STRICT)
            except CMapDB.CMapNotFound as e:
                raise PDFFontError(e)
def dumppdf(
    outfp: TextIO,
    fname: str,
    objids: Iterable[int],
    pagenos: Container[int],
    password: str = '',
    dumpall: bool = False,
    codec: Optional[str] = None,
    extractdir: Optional[str] = None,
    show_fallback_xref: bool = False,
) -> None:
    """Dump selected parts of the PDF file `fname` to `outfp`.

    objids: object ids to dump individually.
    pagenos: zero-based page numbers to dump; with `codec` set the page
        content streams are dumped, otherwise the page attributes.
    password: password handed to PDFDocument.
    dumpall: dump every object of the document.
    codec: passed through to the dump helpers; unless it is 'raw' or
        'binary' a trailing newline is written.
    extractdir: not used by this function.
    show_fallback_xref: passed through to dumpallobjs / dumptrailers.

    When none of objids, pagenos or dumpall is given, the trailers are
    dumped instead.
    """
    # `with` guarantees the file is closed even when parsing or dumping
    # raises; the original leaked the handle in that case.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec, show_fallback_xref)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc, show_fallback_xref)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def dumppdf(outfp, fp, objids, pagenos, password='', dumpall=False, codec=None):
    """Dump selected parts of the PDF read from `fp` to `outfp`.

    fp: binary file object holding the PDF; it is closed by this
        function, including when an error occurs.
    objids: object ids to dump individually.
    pagenos: zero-based page numbers to dump; with `codec` set the page
        content streams are dumped, otherwise the page attributes.
    password: password used to initialize the document.
    dumpall: dump every object of the document.
    codec: passed through to the dump helpers; unless it is 'raw' or
        'binary' a trailing newline is written.

    When none of objids, pagenos or dumpall is given, the trailers are
    dumped instead.
    """
    try:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # The original only closed fp on the success path.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file `fname` to `outfp`.

    objids: object ids to dump individually.
    pagenos: zero-based page numbers to dump; with `codec` set the page
        content streams are dumped, otherwise the page attributes.
    password: password handed to PDFDocument.
    dumpall: dump every object of the document.
    codec: passed through to the dump helpers; unless it is 'raw' or
        'binary' a trailing newline is written.
    extractdir: not used by this function.

    When none of objids, pagenos or dumpall is given, the trailers are
    dumped instead.
    """
    # open() + with: the handle is released even if dumping raises
    # (the original used file() and leaked it on error).
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def do_Do(self, xobjid):
    """Invoke the named XObject; only form XObjects with a BBox are rendered.

    The base of this function is basically copy-pasted from the ancestor
    class; no better solution was found.

    The keyword count collected by the nested interpreter is added to
    self.keyword_count. Raises PDFInterpreterError for an undefined
    xobject id when STRICT is set; otherwise such ids are ignored.
    """
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if STRICT:
            raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
        return
    if self.debug:
        logging.info("Processing xobj: %r" % xobj)

    subtype = xobj.get("Subtype")
    if not (subtype is LITERAL_FORM and "BBox" in xobj):
        # Ignored xobject type.
        return

    interpreter = self.dup()
    interpreter.is_first_level_call = None
    bbox = list_value(xobj["BBox"])
    matrix = list_value(xobj.get("Matrix", MATRIX_IDENTITY))
    # According to PDF reference 1.7 section 4.9.1, XObjects in
    # earlier PDFs (prior to v1.2) use the page's Resources entry
    # instead of having their own Resources entry.
    resources = dict_value(xobj.get("Resources"))
    if not resources:
        resources = self.resources.copy()
    self.device.begin_figure(xobjid, bbox, matrix)
    interpreter.render_contents(resources, [xobj],
                                ctm=mult_matrix(matrix, self.ctm))
    self.device.end_figure(xobjid)
    self.keyword_count += interpreter.keyword_count
    print("Included %i keywords" % interpreter.keyword_count)
    return
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file `fname` to `outfp`.

    objids: object ids to dump individually.
    pagenos: zero-based page numbers to dump; with `codec` set the page
        content streams are dumped, otherwise the page attributes.
    password: password used to initialize the document.
    dumpall: dump every object of the document.
    codec: passed through to the dump helpers; unless it is 'raw' or
        'binary' a trailing newline is written.

    When none of objids, pagenos or dumpall is given, the trailers are
    dumped instead.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Close the file on every path; the original leaked it on error.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def dumppdf(fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None):
    """Return a dump of selected parts of the PDF file `fname` as a string.

    objids: object ids to dump individually.
    pagenos: zero-based page numbers to dump; with `codec` set the page
        content streams are dumped, otherwise the page attributes.
    password: password handed to PDFDocument.
    dumpall: dump every object of the document.
    codec: passed through to the dump helpers; unless it is 'raw' or
        'binary' a trailing newline is appended.
    extractdir: not used by this function.

    When none of objids, pagenos or dumpall is given, the trailers are
    dumped instead.
    """
    # Collect the pieces and join once instead of repeated string +=.
    parts = []
    # `with` closes the file even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                parts.append(dumpxml(obj, codec=codec))
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            parts.append(dumpxml(obj, codec=codec))
                    else:
                        parts.append(dumpxml(page.attrs))
        if dumpall:
            parts.append(dumpallobjs(doc, codec=codec))
        if (not objids) and (not pagenos) and (not dumpall):
            parts.append(dumptrailers(doc))
    if codec not in ('raw', 'binary'):
        parts.append('\n')
    return ''.join(parts)
def __isMask(spec):
    """Return True when the image stream `spec` has "ImageMask" equal to 1.

    A missing "ImageMask" entry yields False.
    """
    image_mask = stream_value(spec).get("ImageMask")
    if image_mask is None:
        return False
    return num_value(image_mask) == 1
def dumppdf(outfp, fname, objids, pagenos, password=b'',
            dumpall=False, mode=None, extractdir=None):
    """Write a dump of selected parts of the PDF file `fname` to `outfp`.

    objids selects individual objects and pagenos selects zero-based
    pages (content streams when `mode` is given, page attributes
    otherwise); dumpall dumps every object. With nothing selected the
    trailers are dumped. A newline is written at the end unless `mode` is
    'raw' or 'binary'. `extractdir` is not used here.
    """
    with open(fname, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file), password)

        if objids:
            for objid in objids:
                dumpxml(outfp, document.getobj(objid), mode=mode)

        if pagenos:
            for index, page in enumerate(PDFPage.create_pages(document)):
                if index not in pagenos:
                    continue
                if mode is None:
                    dumpxml(outfp, page.attrs)
                    continue
                for content in page.contents:
                    dumpxml(outfp, stream_value(content), mode=mode)

        if dumpall:
            dumpallobjs(outfp, document, mode=mode)

        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, document)

    if mode not in ('raw', 'binary'):
        outfp.write('\n')
    return
def dumppdf(outfp, fname, objids, pagenos, password="",
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file `fname` to `outfp`.

    objids: object ids to dump individually.
    pagenos: zero-based page numbers to dump; with `codec` set the page
        content streams are dumped, otherwise the page attributes.
    password: password handed to PDFDocument.
    dumpall: dump every object of the document.
    codec: passed through to the dump helpers; unless it is "raw" or
        "binary" a trailing newline is written.
    extractdir: not used by this function.

    When none of objids, pagenos or dumpall is given, the trailers are
    dumped instead.
    """
    # The context manager closes the file on every exit path; the
    # original file()/close() pair leaked the handle on error.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ("raw", "binary"):
        outfp.write("\n")
    return
def do_Do(self, xobjid):
    """Invoke the named XObject on the device.

    Form XObjects (with a BBox) are rendered through a duplicated
    interpreter; image XObjects (with Width and Height) are handed to
    device.render_image. Other XObject types are ignored. Raises
    PDFInterpreterError for an undefined id when STRICT is set.
    """
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if STRICT:
            raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
        return
    if 1 <= self.debug:
        print >>stderr, 'Processing xobj: %r' % xobj

    attrs = xobj.dic
    subtype = attrs.get('Subtype')
    is_form = subtype is LITERAL_FORM and 'BBox' in attrs
    if is_form:
        sub_interpreter = self.dup()
        bbox = list_value(attrs['BBox'])
        matrix = list_value(attrs.get('Matrix', MATRIX_IDENTITY))
        self.device.begin_figure(xobjid, bbox, matrix)
        resources = dict_value(attrs.get('Resources'))
        sub_interpreter.render_contents(resources, [xobj],
                                        ctm=mult_matrix(matrix, self.ctm))
        self.device.end_figure(xobjid)
        return

    is_image = (subtype is LITERAL_IMAGE
                and 'Width' in attrs and 'Height' in attrs)
    if is_image:
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        size = (attrs['Width'], attrs['Height'])
        self.device.render_image(xobj, size)
        self.device.end_figure(xobjid)
        return

    # Unsupported xobject type: nothing to do.
    return
def get_fontfile(self):
    """Return the data of the first font-file stream in self.descriptor.

    The keys "FontFile", "FontFile2" and "FontFile3" are tried in that
    order; None is returned when the descriptor has none of them.
    """
    descriptor = self.descriptor
    for candidate in ("FontFile", "FontFile2", "FontFile3"):
        if candidate in descriptor:
            return stream_value(descriptor[candidate]).get_data()
    return None
def fillfp(self):
    """Make sure self.fp is set, opening the next stream if necessary.

    Does nothing when self.fp is already set. Otherwise the stream at
    self.istream is wrapped in a StringIO and self.istream is advanced.
    Raises PSEOF when no streams are left.
    """
    if self.fp:
        return
    if len(self.streams) <= self.istream:
        raise PSEOF('Unexpected EOF, file truncated?')
    next_stream = stream_value(self.streams[self.istream])
    self.istream += 1
    self.fp = StringIO(next_stream.get_data())
    return
def get_colorspace(spec):
    """Return the PDFColorSpace described by `spec`.

    `spec` is either a colour space name or a list whose first item is
    the name. ICCBased and DeviceN lists with at least two items get a
    PDFColorSpace built from the second item; every other name is looked
    up in PREDEFINED_COLORSPACE (KeyError for unknown names).
    """
    is_array = isinstance(spec, list)
    name = literal_name(spec[0] if is_array else spec)
    has_params = is_array and 2 <= len(spec)
    if has_params and name == 'ICCBased':
        # Component count comes from the ICC profile stream's N entry.
        return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
    if has_params and name == 'DeviceN':
        # One component per listed colorant name.
        return PDFColorSpace(name, len(list_value(spec[1])))
    return PREDEFINED_COLORSPACE[name]
def __init__(self, descriptor, widths, spec):
    """Set up the encoding and optional ToUnicode map, then init PDFFont.

    The font encoding is given either as the name of a built-in encoding
    or as a dictionary describing differences from a base encoding; it
    defaults to LITERAL_STANDARD_ENCODING when `spec` has no 'Encoding'.
    """
    if 'Encoding' in spec:
        encoding = resolve1(spec['Encoding'])
    else:
        encoding = LITERAL_STANDARD_ENCODING

    if not isinstance(encoding, dict):
        # Plain name of a built-in encoding.
        self.encoding = EncodingDB.get_encoding(literal_name(encoding))
    else:
        base_name = literal_name(
            encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
        differences = list_value(encoding.get('Differences', None))
        self.encoding = EncodingDB.get_encoding(base_name, differences)

    self.ucs2_cmap = None
    if 'ToUnicode' in spec:
        tounicode_stream = stream_value(spec['ToUnicode'])
        cmap = self.ucs2_cmap = CMap()
        CMapParser(cmap, StringIO(tounicode_stream.get_data())).run()

    PDFFont.__init__(self, descriptor, widths)
    return
def __init__(self, descriptor, widths, spec):
    """Resolve the font encoding and ToUnicode CMap, then init PDFFont.

    `spec['Encoding']` may be the name of a built-in encoding or a
    dictionary with 'BaseEncoding' and 'Differences'; without it
    LITERAL_STANDARD_ENCODING is used.
    """
    encoding = LITERAL_STANDARD_ENCODING
    if 'Encoding' in spec:
        encoding = resolve1(spec['Encoding'])

    if isinstance(encoding, dict):
        # Dictionary form: a base encoding plus a list of differences.
        base = encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)
        diff = list_value(encoding.get('Differences', None))
        self.encoding = EncodingDB.get_encoding(literal_name(base), diff)
    else:
        self.encoding = EncodingDB.get_encoding(literal_name(encoding))

    self.ucs2_cmap = None
    if 'ToUnicode' in spec:
        source = stream_value(spec['ToUnicode'])
        self.ucs2_cmap = CMap()
        cmap_parser = CMapParser(self.ucs2_cmap, StringIO(source.get_data()))
        cmap_parser.run()

    PDFFont.__init__(self, descriptor, widths)
    return
def getobj(self, objid):
    """Return the object with id `objid`, loading and caching it if needed.

    The object is looked up in self.objs first; otherwise its position
    is taken from the first xref that knows it and the object is read
    either from an object stream or directly from the file parser. The
    result is passed through decipher_all when self.decipher is set.

    Returns None when the object cannot be located and STRICT is off.
    Raises PDFException if the document is not initialized and
    PDFSyntaxError for malformed object data.
    """
    if not self.ready:
        raise PDFException('PDFDocument not initialized')
    #assert self.xrefs
    if 2 <= self.debug:
        print >>stderr, 'getobj: objid=%r' % (objid)
    if objid in self.objs:
        # Cached object; the generation number is taken as 0.
        # NOTE(review): the cache holds the undeciphered object, so it is
        # deciphered again below on every call - confirm this is intended.
        genno = 0
        obj = self.objs[objid]
    else:
        # Ask each xref in turn; the for/else branch runs only when no
        # xref knows the object.
        for xref in self.xrefs:
            try:
                (strmid, index) = xref.getpos(objid)
                break
            except KeyError:
                pass
        else:
            if STRICT:
                raise PDFSyntaxError('Cannot locate objid=%r' % objid)
            return None
        if strmid:
            # The object lives inside the object stream `strmid`.
            stream = stream_value(self.getobj(strmid))
            if stream.dic.get('Type') is not LITERAL_OBJSTM:
                if STRICT:
                    raise PDFSyntaxError('Not a stream object: %r' % stream)
            try:
                n = stream.dic['N']
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                n = 0
            if strmid in self.parsed_objs:
                objs = self.parsed_objs[strmid]
            else:
                # Parse the whole stream once and remember the result.
                parser = PDFObjStrmParser(self, stream.get_data())
                objs = []
                try:
                    while 1:
                        (_,obj) = parser.nextobject()
                        objs.append(obj)
                except PSEOF:
                    pass
                self.parsed_objs[strmid] = objs
            genno = 0
            # presumably the first n*2 parsed items are the stream's
            # header numbers, so objects start at n*2 - TODO confirm
            i = n*2+index
            try:
                obj = objs[i]
            except IndexError:
                raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
            if isinstance(obj, PDFStream):
                obj.set_objid(objid, 0)
        else:
            # The object is stored directly in the file at offset `index`.
            self.parser.seek(index)
            (_,objid1) = self.parser.nexttoken() # objid
            (_,genno) = self.parser.nexttoken() # genno
            (_,kwd) = self.parser.nexttoken()
            # #### hack around malformed pdf files
            # assert objid1 == objid, (objid, objid1)
            if objid1 != objid:
                # Skip tokens until the 'obj' keyword, collecting them.
                x = []
                while kwd is not self.KEYWORD_OBJ:
                    (_,kwd) = self.parser.nexttoken()
                    x.append(kwd)
                if x:
                    # NOTE(review): x[-2] raises IndexError when only one
                    # token was collected - confirm whether that can occur.
                    objid1 = x[-2]
                    genno = x[-1]
            # #### end hack around malformed pdf files
            if kwd is not self.KEYWORD_OBJ:
                raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
            (_,obj) = self.parser.nextobject()
            if isinstance(obj, PDFStream):
                obj.set_objid(objid, genno)
        if 2 <= self.debug:
            print >>stderr, 'register: objid=%r: %r' % (objid, obj)
        self.objs[objid] = obj
    if self.decipher:
        obj = decipher_all(self.decipher, objid, genno, obj)
    return obj
def getobj(self, objid):
    """Return the object with id `objid`, loading and caching it if needed.

    The object is looked up in self.objs first; otherwise its position
    is taken from the first xref that knows it and the object is read
    either from an object stream or directly from the file parser. The
    result is passed through decipher_all when self.decipher is set.

    Returns None when the object cannot be located and STRICT is off.
    Raises PDFException if the document is not initialized and
    PDFSyntaxError for malformed object data.
    """
    if not self.ready:
        raise PDFException('PDFDocument not initialized')
    #assert self.xrefs
    if 2 <= self.debug:
        print >> stderr, 'getobj: objid=%r' % (objid)
    if objid in self.objs:
        # Cached object; the generation number is taken as 0.
        # NOTE(review): the cache holds the undeciphered object, so it is
        # deciphered again below on every call - confirm this is intended.
        genno = 0
        obj = self.objs[objid]
    else:
        # Ask each xref in turn; the for/else branch runs only when no
        # xref knows the object.
        for xref in self.xrefs:
            try:
                (strmid, index) = xref.getpos(objid)
                break
            except KeyError:
                pass
        else:
            if STRICT:
                raise PDFSyntaxError('Cannot locate objid=%r' % objid)
            return None
        if strmid:
            # The object lives inside the object stream `strmid`.
            stream = stream_value(self.getobj(strmid))
            if stream.dic.get('Type') is not LITERAL_OBJSTM:
                if STRICT:
                    raise PDFSyntaxError('Not a stream object: %r' % stream)
            try:
                n = stream.dic['N']
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                n = 0
            if strmid in self.parsed_objs:
                objs = self.parsed_objs[strmid]
            else:
                # Parse the whole stream once and remember the result.
                parser = PDFObjStrmParser(self, stream.get_data())
                objs = []
                try:
                    while 1:
                        (_, obj) = parser.nextobject()
                        objs.append(obj)
                except PSEOF:
                    pass
                self.parsed_objs[strmid] = objs
            genno = 0
            # presumably the first n*2 parsed items are the stream's
            # header numbers, so objects start at n*2 - TODO confirm
            i = n * 2 + index
            try:
                obj = objs[i]
            except IndexError:
                raise PDFSyntaxError('Invalid object number: objid=%r' %
                                     (objid))
            if isinstance(obj, PDFStream):
                obj.set_objid(objid, 0)
        else:
            # The object is stored directly in the file at offset `index`.
            self.parser.seek(index)
            (_, objid1) = self.parser.nexttoken()  # objid
            (_, genno) = self.parser.nexttoken()  # genno
            (_, kwd) = self.parser.nexttoken()
            # #### hack around malformed pdf files
            # assert objid1 == objid, (objid, objid1)
            if objid1 != objid:
                # Skip tokens until the 'obj' keyword, collecting them.
                x = []
                while kwd is not self.KEYWORD_OBJ:
                    (_, kwd) = self.parser.nexttoken()
                    x.append(kwd)
                if x:
                    # NOTE(review): x[-2] raises IndexError when only one
                    # token was collected - confirm whether that can occur.
                    objid1 = x[-2]
                    genno = x[-1]
            # #### end hack around malformed pdf files
            if kwd is not self.KEYWORD_OBJ:
                raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
            (_, obj) = self.parser.nextobject()
            if isinstance(obj, PDFStream):
                obj.set_objid(objid, genno)
        if 2 <= self.debug:
            print >> stderr, 'register: objid=%r: %r' % (objid, obj)
        self.objs[objid] = obj
    if self.decipher:
        obj = decipher_all(self.decipher, objid, genno, obj)
    return obj
def __initializePTree(self, doc):
    """Fill self.__ptree with one node per page, font and image of `doc`.

    The root node is labelled "Document". Each page gets a child node
    ("Page N", data = page id) and is recorded in self.__pagenos. Under
    each page one node is added per entry of the page's "Font" resources
    and per "XObject" resource whose Subtype is "Image".
    """
    self.__ptree.label = "Document"
    i = 1  # 1-based page counter
    for p in doc.get_pages():
        child = PTree()
        child.label = "Page " + str(i)
        self.__pagenos.setdefault(i, p.pageid)
        i += 1
        child.data = p.pageid
        self.__ptree.children.append(child)
        child.parent = self.__ptree
        fonts = dict_value(p.resources.get("Font"))
        images = dict_value(p.resources.get("XObject"))
        for (fontid, spec) in fonts.iteritems():
            # TODO: will this always be a reference?
            objid = spec.objid
            spec = dict_value(spec)
            child2 = PTree()
            child2.label = "Font " + str(fontid)
            child2.data = Font.new(spec, None, p.pageid, child2,
                                   gui=self.__gui, map=self.__map)
            assert (child2.data.name != None)
            child.children.append(child2)
            child2.parent = child
        # maskMap: objid of a mask stream -> objid of the image using it.
        maskMap = {}
        # masks: tree nodes of images that are themselves image masks.
        masks = []

        def __isMask(spec):
            # True when the stream's "ImageMask" entry equals 1.
            spec = stream_value(spec)
            if spec.get("ImageMask") == None:
                return False
            else:
                return num_value(spec.get("ImageMask")) == 1

        def __hasMask(spec):
            # Records a stream-valued "Mask" entry in maskMap.
            # NOTE(review): the branch that records the mask has no
            # return statement, so it yields None rather than True.
            if stream_value(spec).get("Mask") == None:
                return False
            elif stream_value2(stream_value(spec).get("Mask")) != None:
                # TODO: NOTE pdfminer does not handle genno
                maskMap.setdefault(
                    stream_value(spec).get("Mask").objid, spec.objid)
            else:
                return False

        for (objname, spec) in images.iteritems():
            # TODO: will this always be a reference?
            objid = spec.objid
            isMask = False
            if __isMask(spec):
                isMask = True
            spec = stream_value(spec)
            # Called for its side effect on maskMap; the result is unused.
            __hasMask(spec)
            if literal_name(spec.get("Subtype")) == "Image":
                child2 = PTree()
                child2.label = "Image " + str(objname)
                # i was already incremented above, so i - 1 is this
                # page's number; the trailing 0 is presumably the
                # generation number - TODO confirm
                child2.data = (spec, i - 1, objid, 0)
                child.children.append(child2)
                # TODO: NOTE pdfminer does not support genno
                child2.parent = child
                if isMask:
                    masks.append(child2)
        # For mask nodes whose objid is registered in maskMap, replace the
        # objid in the node data with the objid of the owning image.
        for mask in masks:
            (a, b, c, d) = mask.data
            objid = maskMap.get(c)
            if objid != None:
                mask.data = (a, b, objid, d)
def execute1(self, stream):
    """Interpret one content stream and write per-box filtered copies.

    Every operator of `stream` becomes a token record holding its name,
    its byte range in the raw stream data and a 'view' list with one flag
    per box. Matching do_* handlers are called as the stream is
    interpreted and may replace a token's view. Finally, for each box the
    byte ranges of the tokens still visible are written to
    self.filteredstreams[j].
    """
    strmdata = stream_value(stream).get_data()
    prevpos = 0  # end offset of the previous token = start of the next
    fulltokenlist = []
    try:
        # Enlarge the parser buffer to 20 MiB, on the class and instance.
        PDFContentParser.BUFSIZ = 20*1024*1024
        parser = MyPDFContentParser([stream])
        parser.BUFSIZ = 20*1024*1024
    except PSEOF:
        return
    while 1:
        try:
            (pos, obj) = parser.nextobject()
        except PSEOF:
            break
        if isinstance(obj, PSKeyword):
            inpath = False
            firstpath = False
            ispath = False
            name = keyword_name(obj)
            if len(self.curpath) > 0:
                inpath = True
            # Path construction and painting operators.
            if name in ['m','l','c','v','y','h','re',
                        'S','f','F','f*','F*','B','B*','b','b*','n']:
                ispath = True
                if not inpath:
                    firstpath=True
                    inpath = True
            curpos = pos + len(name)
            if ispath:
                if firstpath:
                    # First operator of a path starts a new token.
                    fulltokenlist.append({'name':name, 'startpos':prevpos, 'endpos':curpos, 'view':[True]*self.nboxes})
                else:
                    # Later path operators extend the previous token.
                    fulltokenlist[-1]['endpos'] = curpos
            else:
                fulltokenlist.append({'name':name, 'startpos':prevpos, 'endpos':curpos, 'view':[True]*self.nboxes})
            prevpos = curpos
            # Map the operator to its handler name, e.g. f* -> do_f_a.
            method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
            if hasattr(self, method):
                func = getattr(self, method)
                # Number of operands = handler arguments excluding self.
                nargs = func.func_code.co_argcount-1
                if nargs:
                    args = self.pop(nargs)
                    if 2 <= self.debug:
                        print >>sys.stderr, 'exec: %s %r' % (name, args)
                    if len(args) == nargs:
                        res = func(*args)
                        # A result containing False replaces the view.
                        if not(res == None or False not in res):
                            fulltokenlist[-1]['view'] = res
                else:
                    if 2 <= self.debug:
                        print >>sys.stderr, 'exec: %s %s' % (method, name)
                    res = func()
                    if not(res == None or False not in res):
                        fulltokenlist[-1]['view']=res
            else:
                pass
                if STRICT:
                    raise PDFInterpreterError('Unknown operator: %r' % name)
        else:
            # Operand: keep it on the stack for the next operator.
            self.push(obj)
    # One output per box, or only the first one when self.exclude is set.
    for j in range(self.nboxes if not self.exclude else 1):
        tokenlist = [a for a in fulltokenlist if a['view'][j]==True]
        # Drop a 'Tf' that is directly followed by 'Td' and another 'Tf'.
        tokenlist = [a for i,a in enumerate(tokenlist) if not (a['name']=='Tf' and i<len(tokenlist)-2 and tokenlist[i+1]['name']=='Td' and tokenlist[i+2]['name']=='Tf')]
        for a in tokenlist:
            self.filteredstreams[j].write(strmdata[a['startpos']:a['endpos']])
    return
def setColourSpace(self, cs):
    """Describe the colour space `cs` in the text control.

    Only array-valued colour spaces are described. For ICCBased, Indexed,
    Separation and DeviceN spaces whose related (alternate / base) colour
    space is itself an array, a hyperlink is written and the related
    space is stored in self.__childObject; a plain name is written as
    text instead.

    Fixes over the previous version: the two `Writetext` calls (which
    raised AttributeError) are now `WriteText`, the Indexed branch ends
    its "Base color space" line with a newline instead of a literal "n",
    and the Indexed base colour space is read from index 1 instead of
    index 3 (the lookup table).
    """
    control = self.__control

    def write_link(label):
        # Write `label` as a blue, underlined hyperlink with URL id "0".
        control.BeginURL("0")
        control.BeginTextColour("#0000ff")
        control.BeginUnderline()
        control.WriteText(label)
        control.EndURL()
        control.EndUnderline()
        control.EndTextColour()

    if list_value2(cs) is not None:
        items = list_value(cs)
        colourSpace = literal_name(items[0])
        control.WriteText("Type: " + colourSpace + "\n")
        if colourSpace == "ICCBased":
            param = stream_value(items[1])
            tmpDict = {}
            if param.get("N") is not None:
                tmpDict.setdefault("N", param.get("N"))
            if param.get("Range") is not None:
                tmpDict.setdefault("Range", param.get("Range"))
            control.WriteText(str(tmpDict) + "\n")
            alternate = param.get("Alternate")
            if alternate is not None:
                if list_value2(alternate) is None:
                    control.WriteText("Alternate color space: " +
                                      literal_name(alternate) + "\n")
                # A link is offered for any Alternate, named or array.
                write_link("Alternate color space\n")
                self.__childObject = alternate
            control.WriteText("ICC profile: " +
                              hexdump(param.get_data()) + "\n")
        elif colourSpace in ["CalGray", "CalRGB", "Lab"]:
            control.WriteText(str(dict_value(items[1])) + "\n")
        elif colourSpace == "Indexed":
            # Array layout: [/Indexed base hival lookup]
            control.WriteText(colourSpace + "\n")
            control.WriteText("Hival: " + str(items[2]) + "\n")
            control.WriteText("Lookup: " + str(items[3]) + "\n")
            base = items[1]
            if list_value2(base) is None:
                control.WriteText("Base color space: " +
                                  literal_name(base) + "\n")
            else:
                write_link("Base color space\n")
                self.__childObject = base
        elif colourSpace in ["Separation", "DeviceN"]:
            # TODO: implement the colorant spaces in DeviceN attributes
            # Array layout: [/Name names alternateSpace tintTransform ...]
            control.WriteText(colourSpace + "\n")
            control.WriteText("Names: " + str(items[1]) + "\n")
            control.WriteText("Tint transform : " + str(items[3]) + "\n")
            alternate = items[2]
            if list_value2(alternate) is None:
                control.WriteText("Alternate color space: " +
                                  literal_name(alternate) + "\n")
            else:
                write_link("Alternate color space\n")
                self.__childObject = alternate
    control.ShowPosition(0)