def do_Do(self, xobjid):
    """Process the XObject registered under ``xobjid``.

    The object is looked up in ``self.xobjmap``.  Form XObjects that carry
    a ``BBox`` entry are rendered through a duplicate of this interpreter
    (``self.dup()``); image XObjects that carry both ``Width`` and
    ``Height`` are handed to ``self.device.render_image``.  Any other
    subtype is ignored.

    Raises:
        PDFInterpreterError: if the id is not in ``self.xobjmap`` and
            ``STRICT`` is truthy.  When ``STRICT`` is falsy the unknown id
            is silently skipped.
    """
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if STRICT:
            raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
        return
    if 1 <= self.debug:
        # Written with ``write`` instead of the ``print >>stderr`` statement
        # so the module parses on both Python 2 and Python 3; the emitted
        # text (message plus newline) is unchanged.
        stderr.write('Processing xobj: %r\n' % xobj)
    subtype = xobj.dic.get('Subtype')
    if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
        interpreter = self.dup()
        bbox = list_value(xobj.dic['BBox'])
        matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY))
        self.device.begin_figure(xobjid, bbox, matrix)
        # The form's own matrix is combined with the current CTM for the
        # nested rendering pass.
        interpreter.render_contents(dict_value(xobj.dic.get('Resources')),
                                    [xobj],
                                    ctm=mult_matrix(matrix, self.ctm))
        self.device.end_figure(xobjid)
    elif (subtype is LITERAL_IMAGE
          and 'Width' in xobj.dic and 'Height' in xobj.dic):
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        (w, h) = (xobj.dic['Width'], xobj.dic['Height'])
        self.device.render_image(xobj, (w, h))
        self.device.end_figure(xobjid)
    else:
        # unsupported xobject type.
        pass
    return
def do_Do(self, xobjid):
    """Process the XObject ``xobjid`` and accumulate its keyword count.

    Only form XObjects that carry a ``BBox`` entry are handled: they are
    rendered through a duplicate of this interpreter, and the duplicate's
    ``keyword_count`` is then added to ``self.keyword_count``.  Every other
    XObject subtype is ignored.

    Raises:
        PDFInterpreterError: if the id is not in ``self.xobjmap`` and
            ``STRICT`` is truthy.  When ``STRICT`` is falsy the unknown id
            is silently skipped.
    """
    # The base of this function is basically copy-pasted from the ancestor;
    # unfortunately, I found no better solution.
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if STRICT:
            raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
        return
    if self.debug:
        logging.info("Processing xobj: %r", xobj)
    subtype = xobj.get("Subtype")
    if subtype is LITERAL_FORM and "BBox" in xobj:
        interpreter = self.dup()
        interpreter.is_first_level_call = None
        bbox = list_value(xobj["BBox"])
        matrix = list_value(xobj.get("Matrix", MATRIX_IDENTITY))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        resources = dict_value(xobj.get("Resources")) or self.resources.copy()
        self.device.begin_figure(xobjid, bbox, matrix)
        interpreter.render_contents(resources, [xobj],
                                    ctm=mult_matrix(matrix, self.ctm))
        self.device.end_figure(xobjid)
        # NOTE: merging of the nested interpreter's text lines is disabled:
        # for (k,v) in interpreter.text_lines.iteritems():
        #     self.text_sequences[k + self.keyword_count] = v
        self.keyword_count += interpreter.keyword_count
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Included %i keywords" % interpreter.keyword_count)
    else:
        # ignored xobject type.
        pass
    return
def draw_cid(self, ts, cid, force_space=False):
    """Append the character for ``cid`` to the current text block.

    The text rendering matrix ``Trm`` is built from the text state ``ts``.
    If either ``Trm[1]`` or ``Trm[2]`` is non-zero the glyph is skipped
    entirely (presumably rotated/skewed text -- confirm with callers).

    The character is appended to ``self.current_block`` when that block has
    the same font (``ts.Tf``) and the same ``Trm[0]`` value; otherwise the
    current block is pushed onto ``self.blocks`` and a new block is started.
    Unless ``force_space`` is set, ``ts.Tm`` is then translated by the
    displacement computed by ``self.new_tx`` / ``self.new_ty``.

    Args:
        ts: text state; read for Tfs, Th, Trise, Tm, Tw, Tc and Tf, and
            ``Tm`` is written.
        cid: character id to draw.
        force_space: when true, a literal space is recorded instead of
            decoding ``cid`` and the text matrix is left untouched.
    """
    verbose("drawing cid: ", cid)
    Trm = utils.mult_matrix((ts.Tfs * ts.Th, 0, 0, ts.Tfs, 0, ts.Trise), ts.Tm)
    if Trm[1] != 0 or Trm[2] != 0:
        return
    verbose('Trm', Trm)

    # Word spacing only applies to the space character (cid 32) or a
    # forced space.
    Tw = ts.Tw if (cid == 32 or force_space) else 0

    if force_space:
        unichar = ' '
    else:
        try:
            unichar = ts.Tf.to_unichr(cid)
        except PDFUnicodeNotDefined as e:
            # This handler used to sit in an outer ``try`` where it could
            # never fire, because the inner ``except Exception`` caught the
            # error first.  Honour MISSING_CHAR when it is configured and
            # otherwise keep the previous effective behaviour (log and
            # substitute a space) instead of raising.
            if MISSING_CHAR:
                unichar = MISSING_CHAR
            else:
                verbose(f"Failed to process {cid = }: {e}")
                unichar = ' '
        except Exception as e:
            # Best-effort: any other decoding failure becomes a space.
            verbose(f"Failed to process {cid = }: {e}")
            unichar = ' '

    (gx, gy) = utils.apply_matrix_pt(Trm, (0, 0))
    verbose("drawing unichar: '", unichar, "' @", gx, ",", gy)
    tfs = Trm[0]

    block = self.current_block
    if block is not None and block[0] == ts.Tf and block[1] == tfs:
        # Same font and same scale: extend the running block.
        block[4].append(unichar)
    else:
        if block is not None:
            self.blocks.append(block)
        self.current_block = (ts.Tf, tfs, gx, gy, [unichar])
    verbose('current block: ', self.current_block)
    verbose('blocks: ', self.blocks)

    if force_space:
        # A forced space does not advance the text matrix.
        return

    w = ts.Tf.char_width(cid)
    if ts.Tf.is_vertical():
        tx = 0
        ty = self.new_ty(w, 0, ts.Tfs, ts.Tc, Tw)
    else:
        tx = self.new_tx(w, 0, ts.Tfs, ts.Tc, Tw, ts.Th)
        ty = 0
    ts.Tm = utils.translate_matrix(ts.Tm, (tx, ty))
def render_string(self, textstate, seq):
    """Render ``seq`` using the parameters held in ``textstate``.

    The text matrix is combined with ``self.ctm``, the spacing values are
    scaled by ``textstate.scaling`` (taken as a percentage), and the work is
    delegated to ``render_string_vertical`` or ``render_string_horizontal``
    depending on ``font.is_vertical()``.  The value returned by the chosen
    renderer is stored back into ``textstate.linematrix``.
    """
    combined = mult_matrix(textstate.matrix, self.ctm)
    font = textstate.font
    fontsize = textstate.fontsize
    scaling = textstate.scaling * .01
    charspace = textstate.charspace * scaling
    wordspace = textstate.wordspace * scaling
    rise = textstate.rise
    # Word spacing is dropped for multi-byte fonts.
    if font.is_multibyte():
        wordspace = 0
    dxscale = .001 * fontsize * scaling

    # Both renderers take the same argument list; only the target differs.
    if font.is_vertical():
        renderer = self.render_string_vertical
    else:
        renderer = self.render_string_horizontal
    textstate.linematrix = renderer(
        seq, combined, textstate.linematrix, font, fontsize,
        scaling, charspace, wordspace, rise, dxscale)
def do_cm(self, a1, b1, c1, d1, e1, f1):
    """Multiply the six-number matrix into ``self.ctm`` and notify the device.

    The operands are packed into a tuple, multiplied with the current
    ``self.ctm`` via ``mult_matrix`` and the result is both stored on the
    interpreter and passed to ``self.device.set_ctm``.
    """
    operand = (a1, b1, c1, d1, e1, f1)
    self.ctm = mult_matrix(operand, self.ctm)
    self.device.set_ctm(self.ctm)
def begin_figure(self, name, bbox, matrix):
    """Start a new figure, saving the item that was being built.

    The current item is pushed onto ``self._stack`` and replaced by a fresh
    ``LTFigure`` whose matrix is ``matrix`` multiplied with ``self.ctm``.
    """
    self._stack.append(self.cur_item)
    figure_matrix = mult_matrix(matrix, self.ctm)
    self.cur_item = LTFigure(name, bbox, figure_matrix)
def show_string(self, ts, array):
    """Collect the text of one show-text operand array into paragraphs.

    Each element of ``array`` is either a number (a positioning
    adjustment) or an encoded string that ``ts.Tf.decode`` turns into
    character ids.  Characters are gathered into words, words into a
    sentence, and the sentence is appended to ``self.paragraph``.

    Paragraph grouping is keyed on the first component of the rendering
    matrix: while it matches the previous call's value the text is added
    to the running paragraph; when it changes, the running paragraph is
    flushed into ``self.paragraph_map`` under the previous key.

    Args:
        ts: text state; read for Tfs, Th, Trise, Tm, Tw, Tc and Tf, and
            ``Tm`` is advanced for every element processed.
        array: iterable of numbers and encoded strings.

    Raises:
        PDFUnicodeNotDefined: if a cid cannot be mapped and ``MISSING_CHAR``
            is falsy.
    """
    verbose(ts)
    sentence = []
    word = []

    m = (ts.Tfs * ts.Th, 0, 0, ts.Tfs, 0, ts.Trise)
    applicable_Tm = utils.mult_matrix(m, ts.Tm)
    (sx, _, _, sy, tx, ty) = applicable_Tm
    current_state = (sx, sy, tx, ty)

    if self.last_state is None:
        self.paragraph = []
        verbose('current paragraph becomes=', self.paragraph)
    elif current_state[0] == self.last_state[0]:
        verbose('DECISION: grouping the text object to last')
    else:
        verbose('DECISION: finalizing the paragraph')
        key = self.last_state[0]
        item = self.paragraph_map.get(key, '')
        if item:
            # Separate the text already stored under this key from the new
            # paragraph.  (Previously ``item = item = ' '`` threw the stored
            # text away and kept only the separator.)
            item += ' '
        new_item = ' '.join(self.paragraph)
        self.paragraph_map[key] = item + new_item
        self.paragraph = []
        verbose('current paragraph becomes=', self.paragraph)
    self.last_state = current_state

    for obj in array:
        verbose("processing obj=", obj)
        if utils.isnumber(obj):
            Tj = obj
            if Tj < WITHIN_WORD_MOVE_LIMIT:
                # An adjustment below the limit is treated as a word break.
                verbose("DECISION: new word")
                sentence.append(''.join(word))
                verbose('current sentence becomes=', sentence)
                word = []
                verbose('current word becomes=', word)
            else:
                verbose("DECISION: move inside the current word")
            if ts.Tf.is_vertical():
                tx = 0
                ty = ((Tj / 1000) * ts.Tfs)
            else:
                tx = ((Tj / 1000) * ts.Tfs) * ts.Th
                ty = 0
            ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)
        else:
            for cid in ts.Tf.decode(obj):
                verbose("processing cid=", cid)
                if cid == 32:
                    # cid 32 ends the current word and picks up word spacing.
                    applicable_Tw = ts.Tw
                    sentence.append(''.join(word))
                    verbose('current sentence becomes=', sentence)
                    word = []
                else:
                    try:
                        text = ts.Tf.to_unichr(cid)
                    except PDFUnicodeNotDefined:
                        if MISSING_CHAR:
                            text = MISSING_CHAR
                        else:
                            raise
                    word.append(text)
                    verbose('current word becomes=', word)
                    applicable_Tw = 0
                w = ts.Tf.char_width(cid)
                if ts.Tf.is_vertical():
                    tx = 0
                    ty = (w * ts.Tfs + ts.Tc + applicable_Tw)
                else:
                    tx = (w * ts.Tfs + ts.Tc + applicable_Tw) * ts.Th
                    ty = 0
                ts.Tm = utils.mult_matrix((1, 0, 0, 1, tx, ty), ts.Tm)

    if word:
        # Flush the trailing word that was not closed by a space.
        sentence.append(''.join(word))
        verbose('current sentence becomes=', sentence)
        word = []
        verbose('current word becomes=', word)
    self.paragraph.append(' '.join(sentence))
    verbose('current paragraph becomes=', self.paragraph)
    return
def do_Td(self, tx, ty):
    """Offset the text line matrix by ``(tx, ty)`` and reset the text matrix.

    A translation matrix built from the operands is multiplied with
    ``self.mpts.Tlm``; the result becomes the new ``Tlm`` and is also
    assigned to ``self.mpts.Tm``.
    """
    verbose_operator("PDF OPERATOR Td: tx=", tx, ", ty=", ty)
    translation = (1, 0, 0, 1, tx, ty)
    self.mpts.Tlm = utils.mult_matrix(translation, self.mpts.Tlm)
    # The text matrix restarts from the freshly moved line matrix.
    self.mpts.Tm = self.mpts.Tlm
def begin_figure(self, name, bbox, matrix):
    """Start a figure through the parent class, then use a ``PDFLocFigure``.

    After the parent's ``begin_figure`` has run, ``self.cur_item`` is
    assigned a ``PDFLocFigure`` built from the same name and bounding box,
    with ``matrix`` multiplied by ``self.ctm``.
    """
    parent = super(PDFLocPageAnalyzer, self)
    parent.begin_figure(name, bbox, matrix)
    figure_matrix = mult_matrix(matrix, self.ctm)
    self.cur_item = PDFLocFigure(name, bbox, figure_matrix)