def process_string(self, ts, array):
    """Walk a show-text operand array, drawing glyphs and applying offsets.

    Numeric entries translate ``ts.Tm`` (along y when ``ts.Tf.is_vertical()``
    is true, along x otherwise).  Every other entry is decoded with
    ``ts.Tf.decode`` and each resulting cid is passed to ``self.draw_cid``.

    :param ts: text state; ``Tf``, ``Tfs``, ``Tw``, ``Th`` are read and
        ``Tm`` is read and rewritten.
    :param array: iterable mixing numbers and encoded strings.
    """
    verbose('SHOW STRING ts: ', ts)
    verbose('SHOW STRING array: ', array)
    for item in array:
        verbose("processing obj: ", item)
        if not utils.isnumber(item):
            verbose("processing string")
            for cid in ts.Tf.decode(item):
                self.draw_cid(ts, cid)
            continue

        # A number (this comes from TJ) translates Tm.
        adjustment = item
        verbose("processing translation: ", adjustment)
        # Only one axis moves, chosen by the writing direction of the font.
        if ts.Tf.is_vertical():
            offset = (0, self.new_ty(0, adjustment, ts.Tfs, 0, ts.Tw))
        else:
            offset = (self.new_tx(0, adjustment, ts.Tfs, 0, ts.Tw, ts.Th), 0)
        ts.Tm = utils.translate_matrix(ts.Tm, offset)
        # A heuristic is still needed here (not sure which); the idea noted
        # by the original author was:
        # if -adjustment > ts.Tf.char_width('o'):
        #     self.draw_cid(ts, 0, force_space=True)
def filterObjs(obj, x, codec=None):
    """Recursively collect every value stored under a ``"URI"`` dict key.

    Dicts and lists are traversed depth-first.  For each dict entry whose key
    equals ``"URI"`` the value is passed through ``e`` and appended to *x*
    (the value itself is still traversed afterwards).  Strings, streams,
    references, keywords, literals and numbers are ignored.

    :param obj: object tree to scan; ``None`` is accepted and ignored.
    :param x: list that receives the collected values (mutated in place).
    :param codec: unused by this function.
    :raises TypeError: if an object of an unrecognised type is encountered.
    """
    if obj is None:
        return
    if isinstance(obj, dict):
        for key, value in six.iteritems(obj):
            if key == "URI":
                x.append(e(value))
            filterObjs(value, x)
    elif isinstance(obj, list):
        for element in obj:
            filterObjs(element, x)
    elif isinstance(obj, (six.string_types, six.binary_type)):
        pass
    elif isinstance(obj, (PDFStream, PDFObjRef, PSKeyword, PSLiteral)):
        pass
    elif not isnumber(obj):
        raise TypeError(obj)
def render_string_horizontal(self, seq, matrix, pos, font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate):
    """Render a horizontal text sequence and report every glyph drawn.

    Numbers in *seq* move the pen left by ``number * dxscale``.  Any other
    entry is iterated together with ``font.decode(entry)``; each pair is
    rendered with ``self.render_char`` and reported through
    ``self.push_char``.  A final ``push_char(None, ...)`` marks the end of
    rendering.

    :param pos: starting ``(x, y)`` position.
    :returns: the ``(x, y)`` position after the last glyph.
    """
    cursor_x, baseline_y = pos
    pending_charspace = False
    # The font height is estimated as the font size multiplied by the
    # height scaling found in the text matrix.
    height_estimate = fontsize * matrix[3]

    for element in seq:
        if utils.isnumber(element):
            cursor_x -= element * dxscale
            pending_charspace = True
            continue
        for raw_byte, cid in zip(element, font.decode(element)):
            if pending_charspace:
                cursor_x += charspace
            glyph_matrix = utils.translate_matrix(matrix, (cursor_x, baseline_y))
            advance = self.render_char(glyph_matrix, font, fontsize, scaling,
                                       rise, cid, ncs, graphicstate)
            self.push_char(bytes([raw_byte]), advance, (cursor_x, baseline_y),
                           height_estimate, font, matrix)
            cursor_x += advance
            # cid 32 additionally advances by the word spacing, when set.
            if cid == 32 and wordspace:
                cursor_x += wordspace
            pending_charspace = True

    # Push None to indicate the end of rendering.
    self.push_char(None, 0, (cursor_x, baseline_y), height_estimate, font, matrix)
    return (cursor_x, baseline_y)
def dumpobj(self, out, obj):
    """Write *obj* to the binary stream *out* using PDF object syntax.

    Handles ``None``, dicts, lists, byte/text strings, booleans, numbers,
    object references, keywords and literals; containers are written
    recursively through ``self.dumpobj``.

    :param out: binary writable object (only ``write`` is used).
    :param obj: the object to serialise.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        out.write(b'null ')
        return
    if isinstance(obj, dict):
        out.write(b'<<')
        for (k, v) in obj.items():
            out.write(b'/%s ' % bytes(k, 'utf-8'))
            self.dumpobj(out, v)
        out.write(b'>>')
        return
    if isinstance(obj, list):
        out.write(b'[')
        for v in obj:
            self.dumpobj(out, v)
        out.write(b']')
        return
    if isinstance(obj, bytes):
        # Inside a PDF literal string a backslash and (unbalanced)
        # parentheses are syntax, so they must be escaped or the emitted
        # string terminates early.  The backslash is replaced first so the
        # escapes added for the parentheses are not doubled.
        escaped = (obj.replace(b'\\', b'\\\\')
                      .replace(b'(', b'\\(')
                      .replace(b')', b'\\)'))
        out.write(b'(')
        out.write(escaped)
        out.write(b')')
        return
    if isinstance(obj, str):
        # NOTE(review): text strings are only passed through ``e``; whether
        # ``e`` already escapes PDF delimiters is not visible here - confirm.
        out.write(b'(')
        out.write(bytes(e(obj), 'utf-8'))
        out.write(b')')
        return
    # bool must be tested before the generic number check.
    if isinstance(obj, bool):
        if obj:
            out.write(b'true ')
        else:
            out.write(b'false ')
        return
    if isnumber(obj):
        if isinstance(obj, float):
            # Strip trailing zeros *before* appending the separator; the
            # previous code stripped after it, which never removed anything.
            s = (b'%.5f' % obj).rstrip(b'0').rstrip(b'.') + b' '
        else:
            s = b'%d ' % obj
        out.write(s)
        return
    if isinstance(obj, PDFObjRef):
        out.write(b'%d 0 R ' % (obj.objid))
        return
    if isinstance(obj, (PSKeyword, PSLiteral)):
        out.write(b'/%s ' % bytes(obj.name, 'utf-8'))
        return
    # PDFStream objects are not handled here.
    raise TypeError(obj)
def dumpxml(out, obj, mode=None):
    """Write an XML rendering of *obj* to the text stream *out*.

    :param out: text stream; for streams in ``'raw'``/``'binary'`` mode the
        bytes are written to ``out.buffer`` instead.
    :param obj: object to dump (``None``, dict, list, bytes, PDFStream,
        PDFObjRef, PSKeyword, PSLiteral or a number).
    :param mode: for PDFStream only - ``'raw'`` writes ``get_rawdata()``,
        ``'binary'`` writes ``get_data()``, ``'text'`` adds a ``<data>``
        element to the XML; anything else dumps just the properties.
        The mode is not forwarded to nested objects.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        out.write('<null />')
    elif isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for key, value in obj.items():
            out.write('<key>%s</key>\n' % key)
            out.write('<value>')
            dumpxml(out, value)
            out.write('</value>\n')
        out.write('</dict>')
    elif isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for element in obj:
            dumpxml(out, element)
            out.write('\n')
        out.write('</list>')
    elif isinstance(obj, bytes):
        out.write('<string size="%d">%s</string>' % (len(obj), encode(obj)))
    elif isinstance(obj, PDFStream):
        if mode == 'raw':
            out.buffer.write(obj.get_rawdata())
        elif mode == 'binary':
            out.buffer.write(obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if mode == 'text':
                payload = obj.get_data()
                out.write('<data size="%d">%s</data>\n'
                          % (len(payload), encode(payload)))
            out.write('</stream>')
    elif isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
    elif isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)
    elif isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)
    elif isnumber(obj):
        out.write('<number>%s</number>' % obj)
    else:
        raise TypeError(obj)
def dumpxml(out, obj, codec=None):
    """Write an XML rendering of *obj* to *out*.

    :param out: writable object (only ``write`` is used).
    :param obj: object to dump (``None``, dict, list, str, PDFStream,
        PDFObjRef, PSKeyword, PSLiteral or a number).
    :param codec: for PDFStream only - ``'raw'`` writes ``get_rawdata()``,
        ``'binary'`` writes ``get_data()``, ``'text'`` adds a ``<data>``
        element; anything else dumps just the properties.  The codec is not
        forwarded to nested objects.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        out.write('<null />')
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` raised
        # AttributeError on Python 3.
        for (k, v) in obj.items():
            out.write('<key>%s</key>\n' % k)
            out.write('<value>')
            dumpxml(out, v)
            out.write('</value>\n')
        out.write('</dict>')
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write('\n')
        out.write('</list>')
        return

    if isinstance(obj, str):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == 'raw':
            out.write(obj.get_rawdata())
        elif codec == 'binary':
            out.write(obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                data = obj.get_data()
                out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
            out.write('</stream>')
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)
        return

    if isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)
        return

    if isnumber(obj):
        out.write('<number>%s</number>' % obj)
        return

    raise TypeError(obj)
def dumpxml(out, obj, codec=None):
    """Serialise *obj* as XML onto *out*.

    :param out: writable object (only ``write`` is used).
    :param obj: ``None``, dict, list, str/bytes, PDFStream, PDFObjRef,
        PSKeyword, PSLiteral or a number.
    :param codec: stream handling - ``"raw"`` / ``"binary"`` write the
        stream data itself, ``"text"`` embeds it in a ``<data>`` element,
        any other value emits only the stream properties.  Nested objects
        are always dumped with the default codec.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        out.write("<null />")
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for name, entry in obj.items():
            out.write("<key>%s</key>\n" % name)
            out.write("<value>")
            dumpxml(out, entry)
            out.write("</value>\n")
        out.write("</dict>")
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for entry in obj:
            dumpxml(out, entry)
            out.write("\n")
        out.write("</list>")
        return

    if isinstance(obj, (str, bytes)):
        size = len(obj)
        out.write('<string size="%d">%s</string>' % (size, e(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == "raw":
            out.write(obj.get_rawdata())
            return
        if codec == "binary":
            out.write(obj.get_data())
            return
        out.write("<stream>\n<props>\n")
        dumpxml(out, obj.attrs)
        out.write("\n</props>\n")
        if codec == "text":
            content = obj.get_data()
            out.write('<data size="%d">%s</data>\n' % (len(content), e(content)))
        out.write("</stream>")
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        out.write("<keyword>%s</keyword>" % obj.name)
        return

    if isinstance(obj, PSLiteral):
        out.write("<literal>%s</literal>" % obj.name)
        return

    if not isnumber(obj):
        raise TypeError(obj)
    out.write("<number>%s</number>" % obj)
def dumpxml(out, obj, codec=None):
    """Append an XML rendering of *obj* to *out* and return the result.

    *out* is extended with ``+=``.  Because that rebinds the local name for
    immutable accumulators such as ``str``, the return value of every
    recursive call is assigned back; previously nested output was discarded
    and the ``None`` branch returned ``None``.

    :param out: accumulator the XML text is appended to.
    :param obj: ``None``, dict, list, str/bytes, PDFStream, PDFObjRef,
        PSKeyword, PSLiteral or a number.
    :param codec: stream handling - ``'raw'`` / ``'binary'`` append the
        stream data itself, ``'text'`` embeds it in a ``<data>`` element,
        any other value emits only the stream properties.  Nested objects
        are dumped with the default codec.
    :returns: *out* with the rendering of *obj* appended.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        out += '<null />'
        return out

    if isinstance(obj, dict):
        out += '<dict size="{}">\n'.format(len(obj))
        for (k, v) in obj.items():
            out += '<key>{}</key>\n'.format(k)
            out += '<value>'
            out = dumpxml(out, v)
            out += '</value>\n'
        out += '</dict>'
        return out

    if isinstance(obj, list):
        out += '<list size="{}">\n'.format(len(obj))
        for v in obj:
            out = dumpxml(out, v)
            out += '\n'
        out += '</list>'
        return out

    if isinstance(obj, (str, bytes)):
        out += '<string size="{}">{}</string>'.format(len(obj), e(obj))
        return out

    if isinstance(obj, PDFStream):
        # NOTE(review): if the stream accessors return bytes, appending them
        # to a str accumulator raises TypeError - confirm what callers pass.
        if codec == 'raw':
            out += obj.get_rawdata()
        elif codec == 'binary':
            out += obj.get_data()
        else:
            out += '<stream>\n<props>\n'
            out = dumpxml(out, obj.attrs)
            out += '\n</props>\n'
            if codec == 'text':
                data = obj.get_data()
                out += '<data size="{}">{}</data>\n'.format(len(data), e(data))
            out += '</stream>'
        return out

    if isinstance(obj, PDFObjRef):
        out += '<ref id="{}" />'.format(obj.objid)
        return out

    if isinstance(obj, PSKeyword):
        out += '<keyword>{}</keyword>'.format(obj.name)
        return out

    if isinstance(obj, PSLiteral):
        out += '<literal>{}</literal>'.format(obj.name)
        return out

    if isnumber(obj):
        out += '<number>{}</number>'.format(obj)
        return out

    raise TypeError(obj)
def get_obj_type(obj):
    """Return a short label naming the kind of *obj*.

    :returns: ``None`` for ``None``; otherwise one of ``'dict'``, ``'list'``,
        ``'str'``, ``'PDFStream'``, ``'PDFObjRef'``, ``'PSKeyword'``,
        ``'PSLiteral'``, ``'number'``, or the string ``'TypeError'`` when
        nothing matches (no exception is raised).
    """
    if obj is None:
        return None

    # Built-in containers and strings are tested first, in this order.
    for builtin_type, label in ((dict, 'dict'), (list, 'list'), (str, 'str')):
        if isinstance(obj, builtin_type):
            return label

    # Then the PDF-specific classes.
    pdf_types = (
        (PDFStream, 'PDFStream'),
        (PDFObjRef, 'PDFObjRef'),
        (PSKeyword, 'PSKeyword'),
        (PSLiteral, 'PSLiteral'),
    )
    for pdf_type, label in pdf_types:
        if isinstance(obj, pdf_type):
            return label

    return 'number' if isnumber(obj) else 'TypeError'
def dumpxml(obj, codec=None):
    """Return an XML rendering of *obj* as a string.

    :param obj: ``None``, dict, list, str, PDFStream, PDFObjRef, PSKeyword,
        PSLiteral or a number.
    :param codec: stream handling - ``'raw'`` / ``'binary'`` return the
        stream data itself, ``'text'`` embeds it in a ``<data>`` element,
        any other value emits only the stream properties.  Nested objects
        are dumped with the default codec.
    :returns: the XML text.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        return '<null />'

    if isinstance(obj, dict):
        # The size attribute used to contain a stray '%' (size="%3"), a
        # leftover from %-formatting; the list branch never had it.
        parts = ['<dict size="' + str(len(obj)) + '">\n']
        # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
        for (k, v) in obj.items():
            parts.append('<key>' + k + '</key>\n')
            parts.append('<value>')
            parts.append(dumpxml(v))
            parts.append('</value>\n')
        parts.append('</dict>')
        return ''.join(parts)

    if isinstance(obj, list):
        parts = ['<list size="' + str(len(obj)) + '">\n']
        for v in obj:
            parts.append(dumpxml(v))
            parts.append('\n')
        parts.append('</list>')
        return ''.join(parts)

    if isinstance(obj, str):
        return '<string size="' + str(len(obj)) + '">' + e(obj) + '</string>'

    if isinstance(obj, PDFStream):
        res = ""
        if codec == 'raw':
            res += obj.get_rawdata()
        elif codec == 'binary':
            res += obj.get_data()
        else:
            res += '<stream>\n<props>\n'
            res += dumpxml(obj.attrs)
            res += '\n</props>\n'
            if codec == 'text':
                data = obj.get_data()
                res += '<data size="' + str(len(data)) + '">' + e(data) + '</data>\n'
            res += '</stream>'
        return res

    if isinstance(obj, PDFObjRef):
        return '<ref id="' + str(obj.objid) + '" />'

    if isinstance(obj, PSKeyword):
        return '<keyword>' + obj.name + '</keyword>'

    if isinstance(obj, PSLiteral):
        return '<literal>' + obj.name + '</literal>'

    if isnumber(obj):
        return '<number>' + str(obj) + '</number>'

    raise TypeError(obj)
def dump(self, obj):
    """Return an XML rendering of *obj*, scanning content along the way.

    Dict keys become element names after every non-word character is
    removed; empty or all-digit names get the prefix ``xml_creator_``.
    Strings and stream data are base64-encoded.  Strings, stream data,
    keyword/literal names and numbers are passed to ``self.check_js``;
    stream data is also passed to ``self.check_swf``.

    :param obj: ``None``, dict, list, str, PDFStream, PDFObjRef, PSKeyword,
        PSLiteral or a number.
    :returns: the XML text.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        return '<null />'

    if isinstance(obj, dict):
        # The size attribute used to contain a stray '%' (size="%3").
        res = '<dict size="' + str(len(obj)) + '">\n'
        # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
        for (k, v) in obj.items():
            k = re.sub(r'\W+', '', k)
            if k.isdigit() or not k:
                k = 'xml_creator_' + k
            res += '<' + k + '>'
            res += self.dump(v)
            res += '</' + k + '>\n'
        res += '</dict>'
        return res

    if isinstance(obj, list):
        res = '<list size="' + str(len(obj)) + '">\n'
        for v in obj:
            res += self.dump(v)
            res += '\n'
        res += '</list>'
        return res

    if isinstance(obj, str):
        self.check_js(obj)
        # Encode as base64 to avoid illegal XML characters.
        # NOTE(review): the 'base64' str codec exists only on Python 2; on
        # Python 3 this call raises LookupError - confirm target version.
        return '<string>' + self.e(obj).encode('base64') + '</string>'

    if isinstance(obj, PDFStream):
        res = '<stream>\n'
        try:
            # Render the properties fully before emitting the opening tag,
            # so a failure here cannot leave a dangling <props> element.
            props = self.dump(obj.attrs)
            res += '<props>\n' + props + '\n</props>\n'
            data = obj.get_data()
            self.check_js(str(data))
            self.check_swf(str(data))
            res += '<data size="' + str(len(data)) + '">' + self.e(
                data).encode('base64') + '</data>\n'
        # Raised e.g. when the stream filter is unsupported.
        except Exception as exc:
            res += '<StreamException>%s</StreamException>' % str(exc)
        # Make sure the tag is closed whatever happened above.
        res += '</stream>'
        return res

    if isinstance(obj, PDFObjRef):
        return '<ref id="' + str(obj.objid) + '" />'

    if isinstance(obj, PSKeyword):
        self.check_js(obj.name)
        return '<keyword>' + obj.name + '</keyword>'

    if isinstance(obj, PSLiteral):
        self.check_js(obj.name)
        return '<literal>' + obj.name + '</literal>'

    if isnumber(obj):
        self.check_js(str(obj))
        return '<number>' + str(obj) + '</number>'

    raise TypeError(obj)
def dumpxml(out, obj, codec=None):
    """Write an XML rendering of *obj* to *out*.

    Unlike a strict dumper, an object of unknown type does not raise: the
    word ``Exception``, the object and its type are printed instead.

    :param out: writable object (only ``write`` is used).
    :param obj: ``None``, dict, list, str, bytes, PDFStream, PDFObjRef,
        PSKeyword, PSLiteral or a number.
    :param codec: stream handling - ``'raw'`` / ``'binary'`` write the
        stream data itself, ``'text'`` embeds it in a ``<data>`` element,
        any other value emits only the stream properties.  Nested objects
        are dumped with the default codec.
    """
    if obj is None:
        out.write('<null />')
    elif isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for key, value in obj.items():
            out.write('<key>%s</key>\n' % key)
            out.write('<value>')
            dumpxml(out, value)
            out.write('</value>\n')
        out.write('</dict>')
    elif isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for element in obj:
            dumpxml(out, element)
            out.write('\n')
        out.write('</list>')
    elif isinstance(obj, str):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
    elif isinstance(obj, bytes):
        # Added to avoid encoding errors: every byte maps to the character
        # with the same code point (which is exactly what latin-1 does).
        text = obj.decode('latin-1')
        out.write('<string size="%d">%s</string>' % (len(text), e(text)))
    elif isinstance(obj, PDFStream):
        if codec == 'raw':
            out.write(obj.get_rawdata())
        elif codec == 'binary':
            out.write(obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                payload = obj.get_data()
                out.write('<data size="%d">%s</data>\n'
                          % (len(payload), e(payload)))
            out.write('</stream>')
    elif isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
    elif isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)
    elif isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)
    elif isnumber(obj):
        out.write('<number>%s</number>' % obj)
    else:
        # Deliberately best-effort: report instead of raising TypeError(obj).
        print('Exception')
        print(obj)
        print(type(obj))
def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None: if obj is None: out.write('<null />') return if isinstance(obj, dict): out.write('<dict size="%d">\n' % len(obj)) for (k, v) in obj.items(): out.write('<key>%s</key>\n' % k) out.write('<value>') dumpxml(out, v) out.write('</value>\n') out.write('</dict>') return if isinstance(obj, list): out.write('<list size="%d">\n' % len(obj)) for v in obj: dumpxml(out, v) out.write('\n') out.write('</list>') return if isinstance(obj, (str, bytes)): out.write('<string size="%d">%s</string>' % (len(obj), escape(obj))) return if isinstance(obj, PDFStream): if codec == 'raw': # Bug: writing bytes to text I/O. This will raise TypeError. out.write(obj.get_rawdata()) # type: ignore [arg-type] elif codec == 'binary': # Bug: writing bytes to text I/O. This will raise TypeError. out.write(obj.get_data()) # type: ignore [arg-type] else: out.write('<stream>\n<props>\n') dumpxml(out, obj.attrs) out.write('\n</props>\n') if codec == 'text': data = obj.get_data() out.write('<data size="%d">%s</data>\n' % (len(data), escape(data))) out.write('</stream>') return if isinstance(obj, PDFObjRef): out.write('<ref id="%d" />' % obj.objid) return if isinstance(obj, PSKeyword): # Likely bug: obj.name is bytes, not str out.write('<keyword>%s</keyword>' % obj.name) # type: ignore [str-bytes-safe] return if isinstance(obj, PSLiteral): # Likely bug: obj.name may be bytes, not str out.write('<literal>%s</literal>' % obj.name) # type: ignore [str-bytes-safe] return if isnumber(obj): out.write('<number>%s</number>' % obj) return raise TypeError(obj)
def dump(self, obj):
    """Render *obj* as XML text, running the content checks on the way.

    Dict keys are turned into element names by dropping every non-word
    character; names that end up empty or purely numeric are prefixed with
    ``xml_creator_``.  String and stream payloads are base64-encoded.
    ``self.check_js`` sees strings, stream data, keyword/literal names and
    numbers; ``self.check_swf`` sees stream data.

    :param obj: ``None``, dict, list, str, PDFStream, PDFObjRef, PSKeyword,
        PSLiteral or a number.
    :returns: the XML text.
    :raises TypeError: if *obj* has an unsupported type.
    """
    if obj is None:
        return '<null />'

    if isinstance(obj, dict):
        # Fixed: the attribute previously came out as size="%N" because of
        # a stray '%' left in the literal.
        pieces = ['<dict size="' + str(len(obj)) + '">\n']
        # ``items()`` works on Python 2 and 3; ``iteritems()`` only on 2.
        for (k, v) in obj.items():
            k = re.sub(r'\W+', '', k)
            if k.isdigit() or not k:
                k = 'xml_creator_' + k
            pieces.append('<' + k + '>')
            pieces.append(self.dump(v))
            pieces.append('</' + k + '>\n')
        pieces.append('</dict>')
        return ''.join(pieces)

    if isinstance(obj, list):
        pieces = ['<list size="' + str(len(obj)) + '">\n']
        for v in obj:
            pieces.append(self.dump(v))
            pieces.append('\n')
        pieces.append('</list>')
        return ''.join(pieces)

    if isinstance(obj, str):
        self.check_js(obj)
        # Encode as base64 to avoid illegal XML characters.
        # NOTE(review): the 'base64' str codec exists only on Python 2; on
        # Python 3 this call raises LookupError - confirm target version.
        return '<string>' + self.e(obj).encode('base64') + '</string>'

    if isinstance(obj, PDFStream):
        pieces = ['<stream>\n']
        try:
            # Build the properties before opening the element so that an
            # error while dumping them cannot leave <props> unclosed.
            props = self.dump(obj.attrs)
            pieces.append('<props>\n' + props + '\n</props>\n')
            data = obj.get_data()
            self.check_js(str(data))
            self.check_swf(str(data))
            pieces.append('<data size="' + str(len(data)) + '">'
                          + self.e(data).encode('base64') + '</data>\n')
        # Raised e.g. when the stream filter is unsupported.
        except Exception as exc:
            pieces.append('<StreamException>%s</StreamException>' % str(exc))
        # Always close the tag, whatever happened above.
        pieces.append('</stream>')
        return ''.join(pieces)

    if isinstance(obj, PDFObjRef):
        return '<ref id="' + str(obj.objid) + '" />'

    if isinstance(obj, PSKeyword):
        self.check_js(obj.name)
        return '<keyword>' + obj.name + '</keyword>'

    if isinstance(obj, PSLiteral):
        self.check_js(obj.name)
        return '<literal>' + obj.name + '</literal>'

    if isnumber(obj):
        self.check_js(str(obj))
        return '<number>' + str(obj) + '</number>'

    raise TypeError(obj)
def show_string(self, ts, array):
    """Collect the text of a show-text operand array into ``self.paragraph``.

    The first component of the effective text matrix is compared with the
    one remembered in ``self.last_state``: when it is unchanged the text
    joins the current paragraph, otherwise the current paragraph is flushed
    into ``self.paragraph_map`` under the previous value and a new one is
    started.  The operands are then split into words (on cid 32 and on
    numeric adjustments below ``WITHIN_WORD_MOVE_LIMIT``) while ``ts.Tm`` is
    advanced for every adjustment and glyph.

    :param ts: text state; ``Tf``, ``Tfs``, ``Th``, ``Trise``, ``Tc``,
        ``Tw`` are read, ``Tm`` is read and rewritten.
    :param array: iterable mixing numbers and encoded strings.
    :raises PDFUnicodeNotDefined: if a cid has no unicode mapping and
        ``MISSING_CHAR`` is falsy.
    """
    verbose(ts)
    sentence = []
    word = []

    m = (ts.Tfs * ts.Th, 0, 0, ts.Tfs, 0, ts.Trise)
    applicable_Tm = utils.mult_matrix(m, ts.Tm)
    (sx, _, _, sy, tx, ty) = applicable_Tm
    current_state = (sx, sy, tx, ty)

    if self.last_state is None:
        self.paragraph = []
        verbose('current paragraph becomes=', self.paragraph)
    elif current_state[0] == self.last_state[0]:
        verbose('DECISION: grouping the text object to last')
    else:
        verbose('DECISION: finalizing the paragraph')
        key = self.last_state[0]
        item = self.paragraph_map.get(key, '')
        if item:
            # Fixed: this used to read ``item = item = ' '``, which replaced
            # the text already stored under this key with a single space.
            item = item + ' '
        new_item = ' '.join(self.paragraph)
        self.paragraph_map[key] = item + new_item
        self.paragraph = []
        verbose('current paragraph becomes=', self.paragraph)
    self.last_state = current_state

    for obj in array:
        verbose("processing obj=", obj)
        if utils.isnumber(obj):
            Tj = obj
            if Tj < WITHIN_WORD_MOVE_LIMIT:
                verbose("DECISION: new word")
                sentence.append(''.join(word))
                verbose('current sentence becomes=', sentence)
                word = []
                verbose('current word becomes=', word)
            else:
                verbose("DECISION: move inside the current word")
            # NOTE(review): the adjustment is added with a positive sign;
            # confirm this matches the sign convention the caller expects.
            if ts.Tf.is_vertical():
                tx = 0
                ty = ((Tj / 1000) * ts.Tfs)
            else:
                tx = ((Tj / 1000) * ts.Tfs) * ts.Th
                ty = 0
            ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)
        else:
            for cid in ts.Tf.decode(obj):
                verbose("processing cid=", cid)
                # Computed per glyph but not used afterwards; kept so the
                # sequence of utils.mult_matrix calls is unchanged.
                m = (ts.Tfs * ts.Th, 0, 0, ts.Tfs, 0, ts.Trise)
                applicable_Tm = utils.mult_matrix(m, ts.Tm)
                if cid == 32:
                    # cid 32 ends the current word and adds word spacing.
                    applicable_Tw = ts.Tw
                    sentence.append(''.join(word))
                    verbose('current sentence becomes=', sentence)
                    word = []
                else:
                    try:
                        text = ts.Tf.to_unichr(cid)
                    except PDFUnicodeNotDefined:
                        if MISSING_CHAR:
                            text = MISSING_CHAR
                        else:
                            raise
                    word.append(text)
                    verbose('current word becomes=', word)
                    applicable_Tw = 0
                w = ts.Tf.char_width(cid)
                if ts.Tf.is_vertical():
                    tx = 0
                    ty = ((w - 0) * ts.Tfs + ts.Tc + applicable_Tw)
                else:
                    tx = ((w - 0) * ts.Tfs + ts.Tc + applicable_Tw) * ts.Th
                    ty = 0
                ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)

    # Flush the word still being built when the operands ran out.
    if len(word) > 0:
        sentence.append(''.join(word))
        verbose('current sentence becomes=', sentence)
        word = []
        verbose('current word becomes=', word)
    self.paragraph.append(' '.join(sentence))
    verbose('current paragraph becomes=', self.paragraph)
    return