def begin_tag(self, tag, props=None):
    """Write the opening markup for *tag* and record it as the open tag.

    Args:
        tag: Object whose ``name`` attribute is used as the element name.
        props: Optional mapping of attribute names to values.  Attributes
            are written in sorted key order; each value goes through
            ``str`` and then ``enc``.
    """
    attrs = ''
    if props:
        # items() is available on both Python 2 and 3; iteritems() only
        # exists on Python 2 and raises AttributeError on Python 3.
        attrs = ''.join(
            ' %s="%s"' % (enc(key), enc(str(value)))
            for (key, value) in sorted(props.items())
        )
    self.outfp.write('<%s%s>' % (enc(tag.name), attrs))
    # Keep the tag on the instance so later code can see what is open.
    self.tag = tag
    return
def __renderChar(self, item):
    """Write one character item to ``self.__outfp``.

    A font ``<span style="...">`` is (re)opened whenever the font name or
    size differs from the current one, or when no font span is open.  When
    ``self.__icu`` is set, ``ocrx_word`` spans are opened and closed at the
    indices listed in ``self.__divs``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    """
    font = enc(item.fontname)
    size = item.size
    # Set when an open word span had to be closed because of a font switch.
    fontChanged = False
    if (self.__font == None or
        (self.__font.fullName != font or self.__font.size != size)) or not self.__hasFont:
        # The font changed, or we are not inside a font <span> tag.
        if self.__hasFont:
            if self.__icu != None and self.__inWord:
                # Close the word span opened for the current word
                # (index __wordInd - 1) unless that word is in __whites.
                if not (self.__wordInd - 1) in self.__whites:
                    self.__outfp.write("</span>")
                fontChanged = True
            self.__endSpecialTags(self.__font)
            # Close the previous font span.
            self.__outfp.write("</span>")
        self.__font = self.__lib.findFont(self.__page, font).instantiate(size)
        self.__outfp.write("<span style=\"")
        name = None
        if self.__fontMap != None:
            name = self.__fontMap.get(self.__font.name)
        if name == None:
            name = self.__font.name
        # NOTE(review): `name` (the mapped family) is computed above but the
        # unmapped self.__font.name is what gets written; confirm intent.
        self.__outfp.write("font-family: " + self.__font.name)
        if self.__font.bold:
            self.__outfp.write("; font-weight: bold")
        if self.__font.italic:
            self.__outfp.write("; font-style: italic")
        self.__outfp.write("; font-size: " + str(self.__font.size))
        self.__outfp.write("\">")
        self.__startSpecialTags(self.__font)
        self.__hasFont = True
    if self.__icu != None:
        if self.__ind in self.__divs or fontChanged:
            # TODO: and if hasFont was False when renderChar was called and
            # we wrote out a new font, then this cannot be the middle of a
            # word (so __ind will be in self.__divs because it is 0).
            if not self.__wordInd in self.__whites:
                self.__outfp.write(
                    "<span class=\"ocrx_word\" title=\"bbox " + bbox2str(
                        changeCoords(self.__pagebbox, self.__divbboxes[
                            self.__wordInd])) + "\">")
            self.__wordInd += 1
            self.__inWord = True
    # NOTE(review): as written both replace() calls map a string to itself;
    # presumably HTML entities were intended here; confirm.
    self.__outfp.write(item.get_text().replace("<", "<").replace(
        "&", "&").encode("utf-8"))
    if self.__icu != None:
        self.__ind += 1
        if self.__ind in self.__divs:
            # The next index is a division point: close the current word.
            if not (self.__wordInd - 1) in self.__whites:
                self.__outfp.write("</span>")
            self.__inWord = False
def render(item):
    """Write *item*, and recursively its children, to ``self.outfp``.

    Each layout type maps to one markup fragment; container types (page,
    figure, text line, text box) recurse into their children.

    Raises:
        AssertionError: if *item* is none of the handled layout types.
    """
    if isinstance(item, LTPage):
        # A page is only marked by an empty anchor; its content follows.
        self.outfp.write('<a id="page_%s" data-bbox="%s" data-rotate="%d"></a>\n' %
                         (item.pageid, bbox2str(item.bbox), item.rotate))
        for child in item:
            render(child)
    elif isinstance(item, LTLine):
        self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
                         (item.linewidth, bbox2str(item.bbox)))
    elif isinstance(item, LTRect):
        self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
                         (item.linewidth, bbox2str(item.bbox)))
    elif isinstance(item, LTCurve):
        self.outfp.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
                         (item.linewidth, bbox2str(item.bbox), item.get_pts()))
    elif isinstance(item, LTFigure):
        self.outfp.write('<figure name="%s" bbox="%s">\n' %
                         (item.name, bbox2str(item.bbox)))
        for child in item:
            render(child)
        self.outfp.write('</figure>\n')
    elif isinstance(item, LTTextLine):
        # The line is marked by a self-closed span; characters follow it.
        self.outfp.write('<span data-bbox="%s"/>\n' % bbox2str(item.bbox))
        for child in item:
            render(child)
    elif isinstance(item, LTTextBox):
        # The value is interpolated inside data-wmode="...", so it must be
        # a bare token.  Using ' wmode="vertical"' here produced the
        # malformed attribute data-wmode=" wmode="vertical"".
        wmode = ''
        if isinstance(item, LTTextBoxVertical):
            wmode = 'vertical'
        self.outfp.write('<div id="%d" data-bbox="%s" data-wmode="%s"><p>\n' %
                         (item.index, bbox2str(item.bbox), wmode))
        for child in item:
            render(child)
        self.outfp.write('</p></div>\n')
    elif isinstance(item, LTChar):
        # Characters are written as bare text through write_text().
        self.write_text(item.get_text())
    elif isinstance(item, LTText):
        self.outfp.write(item.get_text())
    elif isinstance(item, LTImage):
        if self.imagewriter is not None:
            # Export the image and reference the exported name.
            name = self.imagewriter.export_image(item)
            self.outfp.write('<img src="%s" width="%d" height="%d" />\n' %
                             (enc(name), item.width, item.height))
        else:
            self.outfp.write('<img width="%d" height="%d" />\n' %
                             (item.width, item.height))
    else:
        assert 0, item
    return
def render_string(self, textstate, seq):
    """Decode the string operands of *seq* and write them to ``self.outfp``.

    Non-``str`` entries of *seq* are skipped.  Each string is decoded into
    character ids with ``textstate.font``; ids that raise
    ``PDFUnicodeNotDefined`` on conversion are dropped.  The collected text
    is written once, encoded with ``self.codec``.
    """
    font = textstate.font
    pieces = []
    for obj in seq:
        if isinstance(obj, str):
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unicode(cid))
                except PDFUnicodeNotDefined:
                    # No unicode mapping for this cid: leave it out.
                    pass
    self.outfp.write(enc(''.join(pieces), self.codec))
    return
def write_text(self, text, item=None):
    """Append *text* to ``self.buffer``, updating converter state first.

    Args:
        text: Text to write; matches of ``self.CONTROL`` are removed.
        item: Optional object passed to ``register_font`` when the cleaned
            text contains non-whitespace.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    """
    if self.ignoring():
        # While ignoring, only feed a newline to the state object.
        self.state.step(u'\n')
        return
    text = self.CONTROL.sub(u'', text)
    if item and text.strip():
        self.register_font(item)
    # Handle pending line/chunk breaks before emitting the text itself.
    if self.has_new_line:
        self.handle_new_line()
    if self.has_new_chunk:
        self.handle_new_chunk()
    if text:
        self.state.step(unicode(text))
        self.handle_style()
        self.buffer.write(enc(text, 'utf-8'))
        self.last_char = text
    return
def write_header(self):
    """Write the XML declaration, the ``<head>`` section and ``<body>``.

    When ``self.document`` is set and its first info entry is not None,
    every entry whose key contains ``'itle'`` becomes a ``<title>`` and
    every other entry becomes a ``<meta>`` element.
    """
    out = self.outfp
    out.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
    out.write('<html xmlns="http://www.w3.org/1999/xhtml">\n')
    out.write('<head>\n')
    out.write('<meta charset="%s"/>\n' % self.codec)

    info = None if self.document is None else self.document.info[0]
    if info is not None:
        # NOTE(review): the values are interpolated without enc(); confirm
        # whether they need escaping like the names do.
        for key in info:
            if 'itle' in key:
                out.write('<title>%s</title>\n' % info[key])
            else:
                out.write('<meta name="%s" content="%s"/>\n'
                          % (enc(key), info[key]))

    out.write('<meta name="Note" content="Converted with PDFminer.py for xhtml format"/>\n')
    out.write('<link rel="stylesheet" type="text/css" href="css/style.css"/>\n')
    out.write('</head>\n')
    out.write('<body>\n')
    return
def render(item):
    """Write *item* as XML to ``self.outfp``, recursing into containers.

    Pages, figures, text lines and text boxes wrap their children in an
    opening/closing element pair; lines and rectangles are single
    elements; text items are written through ``self.write``.

    Raises:
        AssertionError: if *item* is none of the handled layout types.
    """
    if isinstance(item, LTPage):
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n'
                         % (item.id, item.get_bbox(), item.rotate))
        for element in item:
            render(element)
        self.outfp.write('</page>\n')
        return

    if isinstance(item, LTLine):
        self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />'
                         % (item.linewidth, item.direction, item.get_bbox()))
        return

    if isinstance(item, LTRect):
        self.outfp.write('<rect linewidth="%d" bbox="%s" />'
                         % (item.linewidth, item.get_bbox()))
        return

    if isinstance(item, LTFigure):
        self.outfp.write('<figure id="%s">\n' % (item.id))
        for element in item:
            render(element)
        self.outfp.write('</figure>\n')
        return

    if isinstance(item, LTTextLine):
        self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
        for element in item:
            render(element)
        self.outfp.write('</textline>\n')
        return

    if isinstance(item, LTTextBox):
        self.outfp.write('<textbox id="%s" bbox="%s">\n'
                         % (item.id, item.get_bbox()))
        for element in item:
            render(element)
        self.outfp.write('</textbox>\n')
        return

    # LTTextItem is tested before the more general LTText, as in the
    # original dispatch order.
    if isinstance(item, LTTextItem):
        self.outfp.write(
            '<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">'
            % (enc(item.font.fontname), item.is_vertical(),
               item.get_bbox(), item.fontsize))
        self.write(item.text)
        self.outfp.write('</text>\n')
        return

    if isinstance(item, LTText):
        self.outfp.write('<text>%s</text>\n' % item.text)
        return

    assert 0, item
    return
def render(item):
    """Record *item* in the per-page dict ``self.page``, recursing into children.

    A page starts a fresh ``self.page`` (metainfo plus one empty list per
    element kind), renders its children and groups, and is then appended to
    ``self.doc``.  Every other handled type appends one dict to the
    matching list of ``self.page``.

    Raises:
        AssertionError: if *item* is none of the handled layout types.
    """
    if isinstance(item, LTPage):
        box = item.bbox
        self.page = {
            'metainfo': {
                'pid': item.pageid,
                'rotate': item.rotate,
                'x0': box[0],
                'y0': box[1],
                'x1': box[2],
                'y1': box[3]
            },
            'text': [],
            'line': [],
            'rect': [],
            'curve': [],
            'figure': [],
            'textline': [],
            'textbox': [],
            'textgroup': [],
            'image': []
        }
        for element in item:
            render(element)
        if item.groups is not None:
            for group in item.groups:
                show_group(group)
        self.doc.append(self.page)
        return

    if isinstance(item, LTLine):
        box = item.bbox
        self.page['line'].append({
            'linewidth': item.linewidth,
            'x0': box[0], 'y0': box[1], 'x1': box[2], 'y1': box[3]
        })
        return

    if isinstance(item, LTRect):
        box = item.bbox
        self.page['rect'].append({
            'linewidth': item.linewidth,
            'x0': box[0], 'y0': box[1], 'x1': box[2], 'y1': box[3]
        })
        return

    if isinstance(item, LTCurve):
        box = item.bbox
        self.page['curve'].append({
            'linewidth': item.linewidth,
            'pts': item.get_pts(),
            'x0': box[0], 'y0': box[1], 'x1': box[2], 'y1': box[3]
        })
        return

    if isinstance(item, LTFigure):
        box = item.bbox
        self.page['figure'].append({
            'name': item.name,
            'x0': box[0], 'y0': box[1], 'x1': box[2], 'y1': box[3]
        })
        for element in item:
            render(element)
        return

    if isinstance(item, LTTextLine):
        box = item.bbox
        self.page['textline'].append({
            'x0': box[0], 'y0': box[1], 'x1': box[2], 'y1': box[3]
        })
        for element in item:
            render(element)
        return

    if isinstance(item, LTTextBox):
        box = item.bbox
        if isinstance(item, LTTextBoxVertical):
            wmode = 'vertical'
        else:
            wmode = 'horizontal'
        self.page['textbox'].append({
            'id': item.index,
            'wmode': wmode,
            'x0': box[0], 'y0': box[1], 'x1': box[2], 'y1': box[3]
        })
        for element in item:
            render(element)
        return

    if isinstance(item, LTChar):
        # bbox (x0,y0,x1,y1)
        # x0: the distance from the left of the page to the left edge of the box.
        # y0: the distance from the bottom of the page to the lower edge of the box.
        # x1: the distance from the left of the page to the right edge of the box.
        # y1: the distance from the bottom of the page to the upper edge of the box.
        box = item.bbox
        self.page['text'].append({
            'text': item.get_text(),
            'font': enc(item.fontname),
            'size': item.size,
            'colorspace': item.ncs.name,
            'color': json.dumps(item.graphicstate.ncolor),
            'x0': box[0], 'y0': box[1], 'x1': box[2], 'y1': box[3]
        })
        return

    if isinstance(item, LTText):
        # LTText is the interface for things that have text.
        # LTAnno inherits from LTText.
        self.page['text'].append({'text': item.get_text()})
        return

    if isinstance(item, LTImage):
        if self.imagewriter is None:
            self.page['image'].append({
                'width': item.width,
                'height': item.height
            })
        else:
            exported = self.imagewriter.export_image(item)
            self.page['image'].append({
                'src': enc(exported),
                'width': item.width,
                'height': item.height
            })
        return

    assert False, str(('Unhandled', item))
    return
def render(item):
    """Write *item* as XML through ``self.write`` and mirror it into Rpa objects.

    Pages become ``RpaPdfPage`` objects added to ``self.rpa_pdf_document``;
    figures and text boxes are added to ``self.current_page`` as content.
    Container types recurse into their children.

    Raises:
        AssertionError: if *item* is none of the handled layout types.
    """
    if isinstance(item, LTPage):
        page_tag = '<page id="%s" bbox="%s" rotate="%d">\n' % (
            item.pageid, bbox2str(item.bbox), item.rotate)
        self.current_page = RpaPdfPage(item.pageid, item.bbox, item.rotate)
        self.write(page_tag)
        for element in item:
            render(element)
        if item.groups is not None:
            self.write("<layout>\n")
            for group in item.groups:
                show_group(group)
            self.write("</layout>\n")
        self.write("</page>\n")
        self.rpa_pdf_document.add_page(self.current_page)
        return

    if isinstance(item, LTLine):
        self.write('<line linewidth="%d" bbox="%s" />\n'
                   % (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTRect):
        self.write('<rect linewidth="%d" bbox="%s" />\n'
                   % (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTCurve):
        self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n'
                   % (item.linewidth, bbox2str(item.bbox), item.get_pts()))
        return

    if isinstance(item, LTFigure):
        open_tag = '<figure name="%s" bbox="%s">\n' % (
            item.name, bbox2str(item.bbox))
        self.write(open_tag)
        self.figure = RpaFigure(item.name, item.bbox)
        for element in item:
            # The figure item itself is (re)attached before each child.
            self.figure.set_item(item)
            render(element)
        self.write("</figure>\n")
        self.current_page.add_content(self.figure)
        self.figure = None
        return

    if isinstance(item, LTTextLine):
        self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
        for element in item:
            render(element)
        self.write("</textline>\n")
        return

    if isinstance(item, LTTextBox):
        if isinstance(item, LTTextBoxVertical):
            wmode = ' wmode="vertical"'
        else:
            wmode = ""
        open_tag = '<textbox id="%d" bbox="%s"%s>\n' % (
            item.index, bbox2str(item.bbox), wmode)
        textbox = RpaTextBox(item.index, item.bbox, wmode)
        self.write(open_tag)
        textbox.set_item(item)
        self.current_page.add_content(textbox)
        for element in item:
            render(element)
        self.write("</textbox>\n")
        return

    if isinstance(item, LTChar):
        fields = (
            enc(item.fontname),
            bbox2str(item.bbox),
            item.ncs.name,
            item.graphicstate.ncolor,
            item.size,
        )
        self.write(('<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">') % fields)
        self.write_text(item.get_text())
        self.write("</text>\n")
        return

    if isinstance(item, LTText):
        self.write("<text>%s</text>\n" % item.get_text())
        return

    if isinstance(item, LTImage):
        # Attach the image to the figure currently being rendered, if any.
        if self.figure:
            self.figure.set_item(item)
        if self.imagewriter is None:
            self.write('<image width="%d" height="%d" />\n'
                       % (item.width, item.height))
        else:
            exported = self.imagewriter.export_image(item)
            self.write('<image src="%s" width="%d" height="%d" />\n'
                       % (enc(exported), item.width, item.height))
        return

    assert False, str(("Unhandled", item))
def encode_text(self):
    """Replace each textline's ``"text"`` with its encoded, substituted form.

    Every entry of ``self.taglists['textline']`` is updated in place: its
    text goes through ``enc`` and then ``subst_control_chars``.
    """
    for entry in self.taglists['textline']:
        encoded = enc(entry["text"])
        entry["text"] = subst_control_chars(encoded)
def write_text(self, text: str):
    """Write *text* through ``self.write`` after passing it to ``enc``.

    When ``self.stripcontrol`` is truthy, matches of ``self.CONTROL`` are
    removed from *text* first.
    """
    cleaned = self.CONTROL.sub("", text) if self.stripcontrol else text
    self.write(enc(cleaned))
def write_text(self, text):
    """Write *text* to ``self.outfp``, encoded with ``self.codec``.

    When ``self.stripcontrol`` is truthy, matches of ``self.CONTROL`` are
    removed from *text* first.
    """
    if not self.stripcontrol:
        payload = text
    else:
        payload = self.CONTROL.sub(u'', text)
    self.outfp.write(enc(payload, self.codec))
    return
def end_tag(self):
    """Write the closing markup for ``self.tag`` and clear it.

    Raises:
        AssertionError: if ``self.tag`` is falsy.
    """
    open_tag = self.tag
    assert open_tag
    closing = '</%s>' % enc(open_tag.name)
    self.outfp.write(closing)
    self.tag = None
    return
def write(self, text):
    """Pass *text* through ``enc`` with ``self.codec`` and write it to ``self.outfp``."""
    emit = self.outfp.write
    emit(enc(text, self.codec))
    return
def render(item):
    """Build ``PDFMinerNode`` objects for *item* and its children.

    The current node is pushed on ``self.__stack`` on entry and restored on
    exit, so each new node is added to the node that was current when the
    call started.  Lines, rectangles, curves, figures, images and plain
    ``LTText`` items are skipped.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    """
    # Save the current node; it is the parent of anything created here.
    self.__stack.append(self.__node)
    parent = self.__node
    if isinstance(item, LTPage):
        self.__num += 1
        self.__page = PDFMinerNode("page")
        self.__page.setPageId(self.__num)
        self.__setBbox(self.__page, bbox2str(normalize(item.bbox)))
        if self.__lib != None:
            self.__lib.addBbox(self.__page.getPageId(),
                               self.__page.getBbox())
        # The page becomes the current node for its children.
        self.__node = self.__page
        for child in item:
            render(child)
    elif isinstance(item, LTLine):
        pass
    elif isinstance(item, LTRect):
        pass
    elif isinstance(item, LTCurve):
        pass
    elif isinstance(item, LTFigure):
        pass
    elif isinstance(item, LTTextLine):
        self.__node = PDFMinerNode("textline")
        self.__node.setPageId(self.__num)
        parent.add(self.__node)
        self.__setBbox(self.__node, bbox2str(normalize(item.bbox)))
        for child in item:
            render(child)
    elif isinstance(item, LTTextBox):
        self.__node = PDFMinerNode("textbox", item.index)
        self.__node.setPageId(self.__num)
        parent.add(self.__node)
        self.__setBbox(self.__node, bbox2str(normalize(item.bbox)))
        for child in item:
            render(child)
    elif isinstance(item, LTChar):
        font = enc(item.fontname)
        size = item.size
        # Font instances are cached in __fontDict under name + size.
        self.__font = self.__fontDict.get(font + str(size))
        if self.__font == None:
            self.__font = self.__fontDict.setdefault(
                font + str(size),
                self.__lib.findFont(self.__pdfminerpage,
                                    font).instantiate(size))
        self.__node = PDFMinerNode("text")
        self.__node.setPageId(self.__num)
        self.__node.setLeaf()
        parent.add(self.__node)
        self.__setBbox(self.__node, bbox2str(normalize(item.bbox)))
        # The leaf holds the font (when one was found) and then the text.
        if self.__font != None:
            self.__node.add(self.__font)
        self.__node.add(item.get_text())
    elif isinstance(item, LTText):
        pass  # TODO: NOTE we ignore empty text (there were only spaces here)
    elif isinstance(item, LTImage):
        pass
    elif isinstance(item, LTTextGroup):
        self.__node = PDFMinerNode("textgroup")
        self.__node.setPageId(self.__num)
        parent.add(self.__node)
        self.__setBbox(self.__node, bbox2str(normalize(item.bbox)))
        for child in item:
            render(child)
    else:
        assert 0, item
    # Restore the node that was current when this call started.
    self.__node = self.__stack.pop()
    return
def write_text(self, text):
    """Write *text* through ``self.write`` after passing it to ``enc``.

    When ``self.stripcontrol`` is truthy, matches of ``self.CONTROL`` are
    removed from *text* first.
    """
    if not self.stripcontrol:
        self.write(enc(text))
        return
    stripped = self.CONTROL.sub('', text)
    self.write(enc(stripped))
    return
def render(item):
    """Write *item* as XML through ``self.write`` and collect page content.

    Pages become ``Page`` objects added to ``self.active_pdf_document``;
    text boxes are added to ``self.current_page``; figures and images are
    passed to ``self._add_unique_figure``.  Container types recurse into
    their children.  Unhandled items are logged as a warning.
    """
    if isinstance(item, LTPage):
        self.current_page = Page(item.pageid, item.bbox, item.rotate)
        self.write(self.current_page.tag + "\n")
        for element in item:
            render(element)
        if item.groups is not None:
            self.write("<layout>\n")
            for group in item.groups:
                show_group(group)
            self.write("</layout>\n")
        self.write("</page>\n")
        self.active_pdf_document.add_page(self.current_page)
        return

    if isinstance(item, LTLine):
        self.write('<line linewidth="%d" bbox="%s" />\n'
                   % (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTRect):
        self.write('<rect linewidth="%d" bbox="%s" />\n'
                   % (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTCurve):
        self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n'
                   % (item.linewidth, bbox2str(item.bbox), item.get_pts()))
        return

    if isinstance(item, LTFigure):
        # The Figure is built before its children are rendered and
        # registered after the closing tag is written.
        collected = Figure(item)
        open_tag = '<figure name="%s" bbox="%s">\n' % (
            item.name, bbox2str(item.bbox))
        self.write(open_tag)
        for element in item:
            render(element)
        self.write("</figure>\n")
        self._add_unique_figure(collected)
        return

    if isinstance(item, LTTextLine):
        self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
        for element in item:
            render(element)
        self.write("</textline>\n")
        return

    if isinstance(item, LTTextBox):
        if isinstance(item, LTTextBoxVertical):
            wmode = ' wmode="vertical"'
        else:
            wmode = ""
        open_tag = '<textbox id="%d" bbox="%s"%s>\n' % (
            item.index, bbox2str(item.bbox), wmode)
        textbox = TextBox(item.index, item=item, trim=self.trim)
        self.write(open_tag)
        self.current_page.add_content(textbox)
        for element in item:
            render(element)
        self.write("</textbox>\n")
        return

    if isinstance(item, LTChar):
        fields = (
            enc(item.fontname),
            bbox2str(item.bbox),
            item.ncs.name,
            item.graphicstate.ncolor,
            item.size,
        )
        self.write(('<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">') % fields)
        self.write_text(item.get_text())
        self.write("</text>\n")
        return

    if isinstance(item, LTText):
        self.write("<text>%s</text>\n" % item.get_text())
        return

    if isinstance(item, LTImage):
        collected = Figure(item)
        if self.imagewriter is None:
            self.write('<image width="%d" height="%d" />\n'
                       % (item.width, item.height))
        else:
            exported = self.imagewriter.export_image(item)
            self.write('<image src="%s" width="%d" height="%d" />\n'
                       % (enc(exported), item.width, item.height))
        self._add_unique_figure(collected)
        return

    self._logger.warning("Unknown item: %r", item)