def extract_text(self):
    """Return the text shown by this page's content stream.

    The operations of the page's ``/Contents`` are scanned in order and the
    string operands of the text-showing operators ``Tj``, ``'``, ``"`` and
    ``TJ`` are concatenated. ``T*`` and ``'`` always add a newline; ``"``
    adds one only when its string operand is used. After every operation a
    space is appended unless the text is still empty or already ends with a
    space, so that separately drawn snippets do not run together.

    Only ``TextStringObject`` operands contribute; any other operand is
    skipped.

    Returns:
        The collected text. A page without a ``/Contents`` entry yields an
        empty string.
    """
    text = u""
    try:
        contents = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary: no contents, no text.
        return text
    content = contents.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    for operands, operator in content.operations:
        if operator == "Tj":
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == "T*":
            text += "\n"
        elif operator == "'":
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == '"':
            # The string is the third operand of the '"' operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == "TJ":
            # The operand is an array; only its text strings are kept.
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i
        if text and not text.endswith(" "):
            text += " "  # Don't let words concatenate
    return text
def extractPDFText(self):
    """Return the text shown by this page's content stream.

    The operations of the page's ``/Contents`` are scanned in order and the
    string operands of the text-showing operators ``Tj``, ``'``, ``"`` and
    ``TJ`` are concatenated. ``T*``, ``'`` and ``k`` always add a newline;
    ``"`` adds one only when its string operand is used.

    Returns:
        The collected text. A page without a ``/Contents`` entry yields an
        empty string.
    """
    try:
        contents = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary: no contents, no text.
        return u""
    content = contents.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    parts = []  # joined once at the end instead of repeated concatenation
    for operands, operator in content.operations:
        if operator == "Tj":
            if isinstance(operands[0], TextStringObject):
                parts.append(operands[0])
        elif operator in ("T*", "k"):
            # NOTE(review): "k" is the PDF fill-colour (CMYK) operator;
            # treating it as a line break looks like a heuristic - confirm.
            parts.append("\n")
        elif operator == "'":
            parts.append("\n")
            if isinstance(operands[0], TextStringObject):
                parts.append(operands[0])
        elif operator == '"':
            # The string is the third operand of the '"' operator.
            if isinstance(operands[2], TextStringObject):
                parts.append("\n")
                parts.append(operands[2])
        elif operator == "TJ":
            # The operand is an array; only its text strings are kept.
            parts.extend(
                item for item in operands[0]
                if isinstance(item, TextStringObject)
            )
    return u"".join(parts)
def extractOperators(self):
    """Return the operations of this page's content stream as a new list.

    The page's ``/Contents`` is resolved and, unless it already is a
    ``ContentStream``, wrapped in one. The returned list is a shallow copy
    of ``content.operations``, so adding or removing entries does not
    affect the stream itself.

    Returns:
        A list of the stream's operations, or an empty list when the page
        has no ``/Contents`` entry.
    """
    try:
        contents = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary: nothing to report.
        return []
    content = contents.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    return list(content.operations)
def replace_text(cls, page, text, replace):
    """Build a new page from *page* with *text* replaced in ``Tj`` strings.

    The content stream of *page* is parsed and, for every ``Tj`` operation,
    the first operand is rewritten with ``text`` replaced by ``replace``
    and wrapped in a ``TextStringObject``. Other operators are left
    untouched. A blank page is then created, *page* is merged into it and
    the rewritten stream is installed as its ``/Contents``.

    Returns:
        The newly created page object.
    """
    # HACK
    from pyPdf.pdf import ContentStream, PageObject
    from pyPdf.generic import TextStringObject, NameObject

    stream = ContentStream(page["/Contents"].getObject(), page.pdf)
    for operation in stream.operations:
        args, op_name = operation
        if op_name != 'Tj':
            continue
        # Rewrite the operand in place so the stream sees the new string.
        replaced = args[0].replace(text, replace)
        args[0] = TextStringObject(replaced)

    result = PageObject.createBlankPage(page.pdf)
    result.mergePage(page)
    result[NameObject('/Contents')] = stream
    return result
def pdf_add_content(content_string, page, scale=1, offsetx=0, offsety=0):
    """Append drawing commands to the content stream of a PDF page.

    The page's ``/Contents`` entry is replaced by a new content stream made
    of the existing operations (none if the page has no ``/Contents``)
    followed by the added commands. The page is modified in place and
    nothing is returned.

    Args:
        content_string: The PDF drawing commands to add, as a single string.
        page: The pyPdf.pdf.PageObject to add the content to.
        scale: Uniform scale factor applied to the coordinate system before
            the added content is drawn.
        offsetx: Horizontal translation applied together with ``scale``.
        offsety: Vertical translation applied together with ``scale``.

    The scale and offsets are written with two decimal places.
    """
    matrix_cmd = '%.2f 0 0 %.2f %.2f %.2f cm' % (
        scale, scale, offsetx, offsety)
    # Close the state opened in front of the existing content, then draw
    # the new content inside its own saved/restored state.
    pieces = ['Q', 'q', matrix_cmd, content_string, 'Q']
    added = '\n'.join(pieces)

    try:
        existing = page['/Contents'].getObject()
    except KeyError:
        existing = ArrayObject([])

    new_stream = ContentStream(existing, page.pdf)
    # Existing content may not restore graphics state at the end, so it is
    # wrapped in a save ('q') that the leading 'Q' of the added text closes.
    new_stream.operations.insert(0, [[], 'q'])
    new_stream.operations.append([[], added])
    page[NameObject('/Contents')] = new_stream
def extract_text(self):
    """Patched extractText() from pyPdf that separates text snippets.

    The operations of the page's ``/Contents`` are scanned in order and the
    string operands of ``Tj``, ``'``, ``"`` and ``TJ`` are concatenated.
    ``T*`` and ``'`` always add a newline; ``"`` adds one only when its
    string operand is used. After every operation a space is appended
    unless the text is still empty or already ends with a space.

    Returns:
        The collected text. A page without a ``/Contents`` entry yields an
        empty string.
    """
    text = u""
    try:
        contents = self["/Contents"]
    except KeyError:
        # /Contents is optional in a page dictionary: no contents, no text.
        return text
    content = contents.getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    for operands, operator in content.operations:
        if operator == "Tj":
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == "T*":
            text += "\n"
        elif operator == "'":
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == '"':
            # The string is the third operand of the '"' operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == "TJ":
            # The operand is an array; only its text strings are kept.
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i
        if text and not text.endswith(" "):
            text += " "  # Don't let words concatenate
    return text
def InsertXObject(self, name):
    """Return the drawing list for the XObject called *name*.

    XObject can be an image or a 'form' (an arbitrary PDF sequence). It is
    looked up in the ``/XObject`` resources of ``self.page``.

    * ``/Form``: on first use the form's operations are wrapped in a
      save/restore pair, preceded by its ``/Matrix`` when it has one, run
      through ``self.ProcessOperators`` and cached in
      ``self.formdrawings[name]``. The cached result is returned.
    * ``/Image``: a single entry produced by ``self.AddBitmap``.
    * Any other subtype: an empty list.
    """
    dlist = []
    xobject = self.page["/Resources"].getObject()['/XObject']
    stream = xobject[name]
    subtype = stream.get('/Subtype')
    if subtype == '/Form':
        # insert contents into current page drawing
        if name not in self.formdrawings:       # extract if not already done
            pdf_fonts = self.FetchFonts(stream)
            matrix = stream.get('/Matrix')
            form_ops = ContentStream(stream, self.pdfdoc).operations
            oplist = [([], 'q')]                # push state
            if matrix is not None:
                # /Matrix is optional and defaults to the identity, so an
                # absent entry needs no 'cm' operation at all.
                oplist.append((matrix, 'cm'))   # apply matrix
            oplist.extend(form_ops)             # add form contents
            oplist.append(([], 'Q'))            # restore original state
            self.formdrawings[name] = self.ProcessOperators(oplist, pdf_fonts)
        dlist.extend(self.formdrawings[name])
    elif subtype == '/Image':
        width = stream.get('/Width')
        height = stream.get('/Height')
        filters = stream.get("/Filter", ())
        dlist.append(self.AddBitmap(stream._data, width, height, filters))
    return dlist