def smarten_punctuation(self):
    """Apply typographic ("smart") punctuation to every HTML file.

    For each name returned by ``get_html_names()`` the raw markup is passed
    through the non-breaking-space indent fixer and ``smartyPants``; spaced
    ellipses and dashes are then normalised, mangled comment delimiters are
    repaired, U+FFFD characters are dropped and the file is marked dirty.
    The cache is flushed once after all files have been processed.
    """
    preprocessor = HeuristicProcessor(log=self.log)

    for name in self.get_html_names():
        self.log.info("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")

        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)

        # Smarten punctuation
        html = smartyPants(html)

        # Collapse a spaced three-dot run that follows a word character.
        # A plain r'' literal is used: the pattern is pure ASCII, and the
        # Python-2-only ur'' prefix is a syntax error on Python 3.
        html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html,
                      flags=re.UNICODE | re.MULTILINE)

        # Dash normalisation. Pre-existing dash characters are handled
        # BEFORE the ASCII runs are converted: the replacement text itself
        # contains U+2013/U+2014, so running those passes afterwards would
        # pad the freshly inserted dashes a second time and prevent the
        # comment repair below from ever matching. '---' must still come
        # before '--'.
        for old, new in (
            (u"\u2013", ' – '),
            (u"\x97", ' – '),
            (u"\u2014", ' — '),
            ('---', ' – '),
            ('--', ' — '),
        ):
            html = html.replace(old, new)

        # The '--' pass also rewrites '<!--' and '-->'; put them back.
        html = html.replace(u'<! — ', u'<!-- ')
        html = html.replace(u' — >', u' -->')

        # Remove Unicode replacement characters
        html = html.replace(u"\uFFFD", "")

        # NOTE(review): the rewritten markup is not handed to any setter in
        # this block; confirm that dirty()/flush_cache() are enough to
        # persist it.
        self.dirty(name)

    self.flush_cache()
def smarten_punctuation(self):
    """Apply typographic ("smart") punctuation to every HTML file.

    For each name returned by ``get_html_names()`` the raw markup is passed
    through the non-breaking-space indent fixer and ``smartyPants``;
    ellipses and dashes are then normalised, U+FFFD characters are dropped
    and the result is stored with ``self.set(name, html)``.
    """
    from uuid import uuid4

    preprocessor = HeuristicProcessor(log=self.log)

    # Placeholders for HTML comment delimiters. They contain only letters
    # and hex digits, so none of the dash/ellipsis passes can touch them.
    comment_open = 'smartenpunctuationopen' + uuid4().hex
    comment_close = 'smartenpunctuationclose' + uuid4().hex

    for name in self.get_html_names():
        self.log.debug("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")

        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)

        # Smarten punctuation
        html = smartyPants(html)

        # Collapse a spaced three-dot run that follows a word character.
        html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html)

        # Shield '<!--' and '-->' so the '--' pass below cannot turn the
        # comment delimiters into dashes and break the markup.
        html = html.replace('<!--', comment_open)
        html = html.replace('-->', comment_close)

        # Dash normalisation. Pre-existing dash characters are handled
        # BEFORE the ASCII runs are converted: the replacement text itself
        # contains U+2013/U+2014, so running those passes afterwards would
        # pad the freshly inserted dashes a second time. '---' must still
        # come before '--'.
        for old, new in (
            (u"\u2013", ' – '),
            (u"\x97", ' – '),
            (u"\u2014", ' — '),
            ('---', ' – '),
            ('--', ' — '),
        ):
            html = html.replace(old, new)

        # Remaining unspaced three-dot runs.
        html = html.replace(u"...", "…")

        # Restore the comment delimiters.
        html = html.replace(comment_open, '<!--')
        html = html.replace(comment_close, '-->')

        # Remove Unicode replacement characters
        html = html.replace(u"\uFFFD", "")

        self.set(name, html)
def smarten_punctuation(self):
    """Convert standard punctuation to "smart" punctuation.

    Every name from ``html_names()`` is read with
    ``get_raw(name, force_unicode=True)``; names for which that returns
    ``None`` are skipped. The markup goes through the non-breaking-space
    indent fixer and ``smartyPants``, ellipses and dashes are normalised,
    mangled comment delimiters are repaired and the file is marked dirty.
    The cache is flushed once after the loop.
    """
    preprocessor = HeuristicProcessor(log=self.log)

    for name in self.html_names():
        self.log.info("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name, force_unicode=True)
        if html is None:
            continue

        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)

        # Smarten punctuation
        html = smartyPants(html)

        # Replace every ELLIPSIS_RE match with a single ellipsis character.
        html = ELLIPSIS_RE.sub("…", html)

        # Dash normalisation, using the str.replace method (the
        # module-level string.replace function no longer exists on
        # Python 3). Pre-existing dash characters are handled BEFORE the
        # ASCII runs are converted: the replacement text itself contains
        # U+2013/U+2014, so running those passes afterwards would pad the
        # freshly inserted dashes a second time and prevent the comment
        # repair below from ever matching. '---' must still come before
        # '--'.
        for old, new in (
            ("\u2013", " – "),
            ("\x97", " – "),
            ("\u2014", " — "),
            ("---", " – "),
            ("--", " — "),
        ):
            html = html.replace(old, new)

        # The '--' pass also rewrites '<!--' and '-->'; put them back.
        html = html.replace("<! — ", "<!-- ")
        html = html.replace(" — >", " -->")

        # NOTE(review): the rewritten markup is not handed to any setter in
        # this block; confirm that dirty()/flush_cache() are enough to
        # persist it.
        self.dirty(name)

    self.flush_cache()
def __smarten_punctuation_impl(self, name):
    # type: (str) -> None
    """Convert standard punctuation to "smart" punctuation in one file.

    The file's text is read with
    ``raw_data(name, decode=True, normalize_to_nfc=True)``, run through the
    non-breaking-space indent fixer and ``smartyPants``, has its ellipses
    and dashes normalised and its mangled comment delimiters repaired, and
    is then re-parsed with ``parse_xhtml`` and stored via ``replace``
    before the cache is flushed.

    :param name: name of the file to process.
    :raises Exception: if ``raw_data`` returns ``None`` for ``name``.
    """
    preprocessor = HeuristicProcessor(log=self.log)
    self.log.debug("Smartening punctuation for file {0}".format(name))

    html = self.raw_data(name, decode=True, normalize_to_nfc=True)
    if html is None:
        raise Exception(
            _(  # noqa: F821 - _ is defined in globals by calibre
                "No HTML content in file {0}"
            ).format(name)
        )

    # Fix non-breaking space indents
    html = preprocessor.fix_nbsp_indents(html)

    # Smarten punctuation
    html = smartyPants(html)

    # Replace every ELLIPSIS_RE match with a single ellipsis character.
    html = ELLIPSIS_RE.sub("…", html)

    # Dash normalisation. Pre-existing dash characters are handled BEFORE
    # the ASCII runs are converted: the replacement text itself contains
    # U+2013/U+2014, so running those passes afterwards would pad the
    # freshly inserted dashes a second time and prevent the comment repair
    # below from ever matching. '---' must still come before '--'.
    for old, new in (
        ("\u2013", " – "),
        ("\x97", " – "),
        ("\u2014", " — "),
        ("---", " – "),
        ("--", " — "),
    ):
        html = html.replace(old, new)

    # The '--' pass also rewrites '<!--' and '-->'; put them back so the
    # markup handed to parse_xhtml still has intact comment delimiters.
    html = html.replace("<! — ", "<!-- ")
    html = html.replace(" — >", " -->")

    self.replace(name, self.parse_xhtml(html))
    self.flush_cache()
def smarten_punctuation(self):
    """Apply typographic ("smart") punctuation to every HTML file.

    For each name returned by ``get_html_names()`` the raw markup is passed
    through the non-breaking-space indent fixer and ``smartyPants``;
    ellipses and dashes are then normalised, U+FFFD characters are dropped
    and the result is stored with ``self.set(name, html)``.
    """
    from uuid import uuid4

    preprocessor = HeuristicProcessor(log=self.log)

    # Placeholders for HTML comment delimiters. They contain only letters
    # and hex digits, so none of the dash/ellipsis passes can touch them.
    comment_open = 'smartenpunctuationopen' + uuid4().hex
    comment_close = 'smartenpunctuationclose' + uuid4().hex

    for name in self.get_html_names():
        html = self.get_raw(name)
        html = html.encode("UTF-8")

        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)

        # Smarten punctuation
        html = smartyPants(html)

        # Collapse a spaced three-dot run that follows a word character.
        html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html)

        # Shield '<!--' and '-->' so the '--' pass below cannot turn the
        # comment delimiters into dashes and break the markup.
        html = html.replace('<!--', comment_open)
        html = html.replace('-->', comment_close)

        # Dash normalisation. Pre-existing dash characters are handled
        # BEFORE the ASCII runs are converted: the replacement text itself
        # contains U+2013/U+2014, so running those passes afterwards would
        # pad the freshly inserted dashes a second time. '---' must still
        # come before '--'.
        for old, new in (
            (u"\u2013", ' – '),
            (u"\x97", ' – '),
            (u"\u2014", ' — '),
            ('---', ' – '),
            ('--', ' — '),
        ):
            html = html.replace(old, new)

        # Remaining unspaced three-dot runs.
        html = html.replace(u"...", "…")

        # Restore the comment delimiters.
        html = html.replace(comment_open, '<!--')
        html = html.replace(comment_close, '-->')

        # Remove Unicode replacement characters
        html = html.replace(u"\uFFFD", "")

        self.set(name, html)
def smarten_punctuation(self):
    """Convert standard punctuation to "smart" punctuation.

    Every name from ``html_names()`` is read with
    ``raw_data(name, decode=True, normalize_to_nfc=True)``; names for which
    that returns ``None`` are skipped. The markup goes through the
    non-breaking-space indent fixer and ``smartyPants``, ellipses and
    dashes are normalised, mangled comment delimiters are repaired, and the
    result is written back through ``self.open(name, "wb")`` using the
    encoding from ``encoding_map`` (falling back to ``used_encoding``).
    """
    preprocessor = HeuristicProcessor(log=self.log)

    for name in self.html_names():
        self.log.debug("Smartening punctuation for file {0}".format(name))
        html = self.raw_data(name, decode=True, normalize_to_nfc=True)
        if html is None:
            continue

        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)

        # Smarten punctuation
        html = smartyPants(html)

        # Replace every ELLIPSIS_RE match with a single ellipsis character.
        html = ELLIPSIS_RE.sub("…", html)

        # Dash normalisation, using the str.replace method (the
        # module-level string.replace function no longer exists on
        # Python 3). Pre-existing dash characters are handled BEFORE the
        # ASCII runs are converted: the replacement text itself contains
        # U+2013/U+2014, so running those passes afterwards would pad the
        # freshly inserted dashes a second time and prevent the comment
        # repair below from ever matching. '---' must still come before
        # '--'.
        for old, new in (
            ("\u2013", " – "),
            ("\x97", " – "),
            ("\u2014", " — "),
            ("---", " – "),
            ("--", " — "),
        ):
            html = html.replace(old, new)

        # The '--' pass also rewrites '<!--' and '-->'; put them back.
        html = html.replace("<! — ", "<!-- ")
        html = html.replace(" — >", " -->")

        encoding = self.encoding_map.get(name, self.used_encoding)
        with self.open(name, "wb") as f:
            f.write(html.encode(encoding))
def smarten_punctuation(html, log=None):
    """Return *html* with "smart" punctuation applied.

    HTML comment delimiters are swapped for unique placeholder tokens while
    the non-breaking-space indent fixer and ``smartyPants`` run, then put
    back. The final text is passed through ``substitute_entites`` and that
    result is returned.

    :param html: markup to process.
    :param log: forwarded to ``HeuristicProcessor``; defaults to ``None``.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    indent_fixer = HeuristicProcessor(log=log)
    from uuid import uuid4

    # One random token per delimiter so the steps below never see
    # '<!--' or '-->'.
    open_token = 'calibre-smartypants-' + str(uuid4())
    close_token = 'calibre-smartypants-' + str(uuid4())
    shielded = html.replace('<!--', open_token).replace('-->', close_token)

    smartened = smartyPants(indent_fixer.fix_nbsp_indents(shielded))

    restored = smartened.replace(open_token, '<!--').replace(close_token, '-->')
    return substitute_entites(restored)
def smarten_punctuation(html, log):
    """Return *html* with "smart" punctuation applied.

    HTML comment delimiters are swapped for unique placeholder tokens while
    the non-breaking-space indent fixer and ``smartyPants`` run, then put
    back. Afterwards three-dot runs following a word character become a
    single ellipsis and a whitespace-delimited ``--`` becomes U+2014. The
    final text is passed through ``substitute_entites`` and returned.

    :param html: markup to process.
    :param log: forwarded to ``HeuristicProcessor``.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    indent_fixer = HeuristicProcessor(log=log)
    from uuid import uuid4

    # One random token per delimiter so the steps below never see
    # '<!--' or '-->'.
    open_token = 'calibre-smartypants-' + str(uuid4())
    close_token = 'calibre-smartypants-' + str(uuid4())
    delimiters = (('<!--', open_token), ('-->', close_token))

    for delimiter, token in delimiters:
        html = html.replace(delimiter, token)

    html = smartyPants(indent_fixer.fix_nbsp_indents(html))

    for delimiter, token in delimiters:
        html = html.replace(token, delimiter)

    # Ellipsis first, then the whitespace-delimited double dash.
    with_ellipsis = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
    with_dashes = re.sub(r'\s--\s', u'\u2014', with_ellipsis)
    return substitute_entites(with_dashes)
def smarten_punctuation(html, log):
    """Apply "smart" punctuation to *html* and return the result.

    Steps, in order: hide ``<!--`` / ``-->`` behind unique tokens, fix
    non-breaking-space indents, run ``smartyPants``, restore the comment
    delimiters, collapse three-dot runs after a word character into one
    ellipsis, turn a whitespace-delimited ``--`` into U+2014, and finally
    pass the text through ``substitute_entites``.

    :param html: markup to process.
    :param log: forwarded to ``HeuristicProcessor``.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    nbsp_fixer = HeuristicProcessor(log=log)
    from uuid import uuid4

    token_prefix = 'calibre-smartypants-'
    begin_token = token_prefix + str(uuid4())
    end_token = token_prefix + str(uuid4())

    # Keep comment delimiters out of sight of the steps below.
    text = html.replace('<!--', begin_token)
    text = text.replace('-->', end_token)

    text = nbsp_fixer.fix_nbsp_indents(text)
    text = smartyPants(text)

    text = text.replace(begin_token, '<!--')
    text = text.replace(end_token, '-->')

    # Regex clean-ups applied in sequence: ellipsis, then double dash.
    for pattern, replacement in (
        (r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…'),
        (r'\s--\s', u'\u2014'),
    ):
        text = re.sub(pattern, replacement, text)

    return substitute_entites(text)
def smarten_punctuation(html, log):
    """Produce a "smart" punctuation version of *html*.

    Comment delimiters are temporarily replaced by unique tokens so that
    the non-breaking-space indent fixer and ``smartyPants`` do not see
    them. Once they are restored, three-dot runs following a word
    character are collapsed to one ellipsis and a ``--`` surrounded by
    whitespace is replaced with U+2014. The return value is whatever
    ``substitute_entites`` yields for the processed text.

    :param html: markup to process.
    :param log: forwarded to ``HeuristicProcessor``.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    heuristics = HeuristicProcessor(log=log)
    from uuid import uuid4

    comment_start = "calibre-smartypants-" + str(uuid4())
    comment_stop = "calibre-smartypants-" + str(uuid4())

    # Mask the comment delimiters, process, then unmask.
    masked = html.replace("<!--", comment_start).replace("-->", comment_stop)
    processed = smartyPants(heuristics.fix_nbsp_indents(masked))
    unmasked = processed.replace(comment_start, "<!--")
    unmasked = unmasked.replace(comment_stop, "-->")

    # Ellipsis first, then the whitespace-delimited double dash.
    unmasked = re.sub(r"(?u)(?<=\w)\s?(\.\s?){2}\.", "…", unmasked)
    return substitute_entites(re.sub(r"\s--\s", u"\u2014", unmasked))