def smarten_punctuation(container, report):
    """Smarten punctuation in every spine item of ``container``.

    Each spine file is opened for in-place update, decoded with
    ``container.decode`` and passed through the conversion pipeline's
    ``smarten_punctuation``. A file is rewritten (UTF-8 with a BOM, after
    ``strip_encoding_declarations``) only when the smartened text differs
    from what was read.

    :param container: object providing ``spine_items``, ``abspath_to_name``,
        ``open``, ``decode``, ``log``, ``parsed`` and ``dirty``.
    :param report: callable invoked with a single message for every file
        that changed, or once with a "nothing found" message.
    :return: ``True`` if at least one file was changed, else ``False``.
    """
    # Aliased so the imported helper does not hide this function's own name.
    from calibre.ebooks.conversion.preprocess import (
        smarten_punctuation as smarten_html)

    any_changed = False
    for spine_path in container.spine_items:
        name = container.abspath_to_name(spine_path)
        with container.open(name, 'r+b') as stream:
            original = container.decode(stream.read())
            smartened = smarten_html(original, container.log)
            modified = smartened != original
            if modified:
                report(_('Smartened punctuation in: %s') % name)
                cleaned = strip_encoding_declarations(smartened)
                # Overwrite the file from the start with the new content.
                stream.seek(0)
                stream.truncate()
                stream.write(codecs.BOM_UTF8 + cleaned.encode('utf-8'))
        if not modified:
            continue
        # Remove every <meta http-equiv=...> element from the parsed tree.
        # Per the original note, an encoding declaration is added
        # automatically when the file is serialized.
        tree = container.parsed(name)
        for meta in tree.xpath(
                'descendant::*[local-name()="meta" and @http-equiv]'):
            meta.getparent().remove(meta)
        container.dirty(name)
        any_changed = True

    if not any_changed:
        report(_('No punctuation that could be smartened found'))
    return any_changed
def smarten_punctuation(container, report):
    """Run punctuation smartening over all spine files of ``container``.

    For each entry in ``container.spine_items`` the file content is read,
    decoded and smartened. When the result differs from the input, the
    file is truncated and rewritten as BOM-prefixed UTF-8 (with encoding
    declarations stripped), its ``<meta http-equiv>`` elements are removed
    from the parsed tree, and the file is marked dirty.

    :param container: book container exposing ``spine_items``,
        ``abspath_to_name``, ``open``, ``decode``, ``log``, ``parsed`` and
        ``dirty``.
    :param report: callable taking one message argument.
    :return: ``True`` when any file was modified, ``False`` otherwise.
    """
    # Local alias: the imported function has the same name as this one.
    from calibre.ebooks.conversion.preprocess import (
        smarten_punctuation as apply_smartening)

    meta_query = 'descendant::*[local-name()="meta" and @http-equiv]'
    touched = False
    for item_path in container.spine_items:
        name = container.abspath_to_name(item_path)
        rewritten = False
        with container.open(name, 'r+b') as fobj:
            before = container.decode(fobj.read())
            after = apply_smartening(before, container.log)
            if after != before:
                rewritten = True
                report(_('Smartened punctuation in: %s') % name)
                after = strip_encoding_declarations(after)
                # Replace the whole file content in place.
                fobj.seek(0)
                fobj.truncate()
                fobj.write(codecs.BOM_UTF8 + after.encode('utf-8'))
        if rewritten:
            # Existing http-equiv metas are dropped; the original note says
            # an encoding declaration is re-added on serialization.
            parsed_root = container.parsed(name)
            stale_metas = parsed_root.xpath(meta_query)
            for node in stale_metas:
                node.getparent().remove(node)
            container.dirty(name)
            touched = True
    if not touched:
        report(_('No punctuation that could be smartened found'))
    return touched
def postprocess_book(self, oeb, opts, log):
    """Clean up spine documents after the book has been parsed.

    For every spine item whose ``data`` has an ``xpath`` attribute:

    * all ``h:metadata`` and ``h:guide`` elements are removed;
    * if the first ``h:body`` has exactly one child and that child is a
      ``<pre>``, the ``<pre>`` text is converted to HTML markup, optionally
      smartened, reparsed, and the ``<pre>`` is turned into a ``<div>``
      that receives deep copies of the reparsed ``h:body`` matches.

    :param oeb: book object whose ``spine`` is iterated.
    :param opts: options object; ``opts.smarten_punctuation`` is consulted.
    :param log: not used by this method; logging goes through ``self.log``.
    """
    from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML

    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue

        for bad in ('metadata', 'guide'):
            unwanted = XPath('//h:' + bad)(root)
            if not unwanted:
                continue
            for node in unwanted:
                node.getparent().remove(node)

        bodies = XPath('//h:body')(root)
        if not bodies:
            continue
        body = bodies[0]
        if len(body) != 1 or body[0].tag != XHTML('pre'):
            continue

        pre = body[0]
        # Imports are deliberately deferred until this case is hit.
        from calibre.ebooks.txt.processor import (
            convert_basic, separate_paragraphs_single_line)
        from calibre.ebooks.chardet import xml_to_unicode
        from lxml import etree
        import copy

        self.log('LIT file with all text in singe <pre> tag detected')
        markup = separate_paragraphs_single_line(pre.text)
        markup = convert_basic(markup)
        markup = markup.replace('<html>', '<html xmlns="%s">' % XHTML_NS)
        markup = xml_to_unicode(
            markup, strip_encoding_pats=True, resolve_entities=True)[0]
        if opts.smarten_punctuation:
            # SmartyPants skips text inside <pre> tags
            from calibre.ebooks.conversion.preprocess import (
                smarten_punctuation)
            markup = smarten_punctuation(markup, self.log)

        reparsed = etree.fromstring(markup)
        new_bodies = XPath('//h:body')(reparsed)
        # Reuse the existing element as a <div> container for the new markup.
        pre.tag = XHTML('div')
        pre.text = ''
        for elem in new_bodies:
            pre.append(copy.deepcopy(elem))
def postprocess_book(self, oeb, opts, log):
    """Post-process each spine document of ``oeb``.

    Spine items whose ``data`` lacks an ``xpath`` attribute are skipped.
    For the rest, ``h:metadata`` and ``h:guide`` elements are stripped.
    When the first ``h:body`` consists of a single ``<pre>`` child, that
    element's text is run through ``separate_paragraphs_single_line``,
    ``convert_basic`` and ``xml_to_unicode`` (plus punctuation smartening
    if ``opts.smarten_punctuation`` is set), reparsed with lxml, and the
    ``<pre>`` is retagged as a ``<div>`` holding deep copies of the
    ``h:body`` matches from the reparsed markup.

    :param oeb: book object providing the ``spine`` to iterate.
    :param opts: conversion options; only ``smarten_punctuation`` is read.
    :param log: unused here; messages are written via ``self.log``.
    """
    from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML

    find_body = '//h:body'
    for spine_item in oeb.spine:
        doc = spine_item.data
        if not hasattr(doc, 'xpath'):
            continue

        for tag_name in ('metadata', 'guide'):
            matches = XPath('//h:' + tag_name)(doc)
            if matches:
                for match in matches:
                    parent = match.getparent()
                    parent.remove(match)

        found = XPath(find_body)(doc)
        if found:
            first_body = found[0]
            only_pre = (len(first_body) == 1 and
                        first_body[0].tag == XHTML('pre'))
            if only_pre:
                pre = first_body[0]
                # Lazily imported: only needed for this single-<pre> case.
                from calibre.ebooks.txt.processor import (
                    convert_basic, separate_paragraphs_single_line)
                from calibre.ebooks.chardet import xml_to_unicode
                from lxml import etree
                import copy

                self.log('LIT file with all text in singe <pre> tag detected')
                text = separate_paragraphs_single_line(pre.text)
                namespaced_open = '<html xmlns="%s">' % XHTML_NS
                text = convert_basic(text).replace('<html>', namespaced_open)
                decoded = xml_to_unicode(
                    text, strip_encoding_pats=True, resolve_entities=True)
                text = decoded[0]
                if opts.smarten_punctuation:
                    # SmartyPants skips text inside <pre> tags
                    from calibre.ebooks.conversion.preprocess import (
                        smarten_punctuation)
                    text = smarten_punctuation(text, self.log)

                fresh_root = etree.fromstring(text)
                fresh_bodies = XPath(find_body)(fresh_root)
                # The old <pre> becomes an empty <div> that is then filled.
                pre.tag = XHTML('div')
                pre.text = ''
                for source_elem in fresh_bodies:
                    clone = copy.deepcopy(source_elem)
                    pre.append(clone)
def smarten_punctuation(self):
    """Smarten the punctuation of ``self.html``.

    Reads ``self.html``, passes it through the conversion pipeline's
    ``smarten_punctuation`` and assigns the result back to ``self.html``
    only when it differs from the value that was read.
    """
    # Aliased because the imported helper shares this method's name.
    from calibre.ebooks.conversion.preprocess import (
        smarten_punctuation as smarten_markup)

    current = self.html
    smartened = smarten_markup(current)
    if current != smartened:
        # Skip the assignment entirely when nothing changed.
        self.html = smartened