def parse_html(markup):
    """Normalise *markup* to cleaned text and parse it with html5_parser.

    Text input (``unicode_type``) has its encoding declarations stripped and
    its entities substituted. Any other input is passed to ``xml_to_unicode``
    with ``strip_encoding_pats=True`` and ``resolve_entities=True``, and the
    first item of its result is used. Either way the text then goes through
    ``clean_xml_chars`` before being parsed.

    Returns the result of ``html5_parser.soup.parse(..., return_root=False)``.
    """
    from calibre.ebooks.chardet import (
        strip_encoding_declarations, xml_to_unicode, substitute_entites)
    from calibre.utils.cleantext import clean_xml_chars

    if not isinstance(markup, unicode_type):
        # Non-text input: let xml_to_unicode produce the text.
        text = xml_to_unicode(
            markup, strip_encoding_pats=True, resolve_entities=True)[0]
    else:
        text = substitute_entites(strip_encoding_declarations(markup))
    cleaned = clean_xml_chars(text)

    from html5_parser.soup import parse
    return parse(cleaned, return_root=False)
def smarten_punctuation(html, log=None):
    """Run the smartypants punctuation pass over *html*.

    The comment delimiters ``<!--`` and ``-->`` are swapped for unique
    placeholder tokens before processing and swapped back afterwards, so the
    passes in between never see the literal delimiters.

    :param html: the markup text to process.
    :param log: passed to ``HeuristicProcessor`` as its ``log`` argument.
    :return: the processed text after ``substitute_entites``.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from uuid import uuid4

    heuristics = HeuristicProcessor(log=log)
    prefix = 'calibre-smartypants-'
    # (delimiter, placeholder) pairs; order matters for both passes.
    placeholders = (
        ('<!--', prefix + str(uuid4())),
        ('-->', prefix + str(uuid4())),
    )

    for delimiter, token in placeholders:
        html = html.replace(delimiter, token)

    html = smartyPants(heuristics.fix_nbsp_indents(html))

    for delimiter, token in placeholders:
        html = html.replace(token, delimiter)
    return substitute_entites(html)
def smarten_punctuation(html, log):
    """Apply smartypants plus ellipsis and em-dash clean-up to *html*.

    Comment delimiters are replaced by unique placeholder tokens while the
    nbsp-indent fix and smartypants run, then restored. Afterwards, three
    dots following a word character become ``…`` and a whitespace-delimited
    ``--`` becomes an em dash.

    :param html: the markup text to process.
    :param log: passed to ``HeuristicProcessor`` as its ``log`` argument.
    :return: the processed text after ``substitute_entites``.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from uuid import uuid4

    heuristics = HeuristicProcessor(log=log)
    # (delimiter, placeholder) pairs; order matters for both passes.
    placeholders = (
        ('<!--', 'calibre-smartypants-' + str(uuid4())),
        ('-->', 'calibre-smartypants-' + str(uuid4())),
    )

    for delimiter, token in placeholders:
        html = html.replace(delimiter, token)

    html = smartyPants(heuristics.fix_nbsp_indents(html))

    for delimiter, token in placeholders:
        html = html.replace(token, delimiter)

    # Dots after a word character (optionally spaced) collapse to one ellipsis.
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
    # A double dash with whitespace on both sides becomes an em dash; the
    # surrounding whitespace is consumed by the match.
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)
def smarten_punctuation(html, log):
    """Smarten the punctuation in *html* and return the result.

    Stages, in order: mask comment delimiters with unique tokens, fix nbsp
    indents, run smartypants, unmask the delimiters, turn spaced or unspaced
    triple dots after a word character into ``…``, turn a
    whitespace-delimited ``--`` into an em dash, and finally substitute
    entities.

    :param html: the markup text to process.
    :param log: passed to ``HeuristicProcessor`` as its ``log`` argument.
    :return: the processed text after ``substitute_entites``.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from uuid import uuid4

    indent_fixer = HeuristicProcessor(log=log)
    comment_open = 'calibre-smartypants-'+str(uuid4())
    comment_close = 'calibre-smartypants-'+str(uuid4())

    # Mask the comment delimiters so the next two passes never see them.
    masked = html.replace('<!--', comment_open).replace('-->', comment_close)
    smartened = smartyPants(indent_fixer.fix_nbsp_indents(masked))
    unmasked = smartened.replace(comment_open, '<!--').replace(comment_close, '-->')

    # Triple dots following a word character -> single ellipsis character.
    with_ellipses = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', unmasked)
    # Whitespace-delimited double dash -> em dash (whitespace is consumed).
    with_dashes = re.sub(r'\s--\s', u'\u2014', with_ellipses)
    return substitute_entites(with_dashes)
def smarten_punctuation(html, log):
    """Return *html* with typographic punctuation applied.

    ``<!--`` and ``-->`` are temporarily replaced by unique tokens while
    ``fix_nbsp_indents`` and ``smartyPants`` run. After they are restored,
    dot triples that follow a word character are replaced by ``…`` and a
    ``--`` surrounded by whitespace is replaced by an em dash. The result is
    passed through ``substitute_entites``.

    :param html: the markup text to process.
    :param log: passed to ``HeuristicProcessor`` as its ``log`` argument.
    :return: the processed text.
    """
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from uuid import uuid4

    processor = HeuristicProcessor(log=log)
    token_prefix = "calibre-smartypants-"
    open_token = token_prefix + str(uuid4())
    close_token = token_prefix + str(uuid4())

    # Patterns for the two clean-up passes that run after smartypants.
    ellipsis_pattern = re.compile(r"(?u)(?<=\w)\s?(\.\s?){2}\.")
    double_dash_pattern = re.compile(r"\s--\s")

    # Swap comment delimiters for tokens; opening delimiter goes first.
    text = html.replace("<!--", open_token)
    text = text.replace("-->", close_token)

    text = processor.fix_nbsp_indents(text)
    text = smartyPants(text)

    # Put the comment delimiters back in the same order.
    text = text.replace(open_token, "<!--")
    text = text.replace(close_token, "-->")

    text = ellipsis_pattern.sub("…", text)
    text = double_dash_pattern.sub(u"\u2014", text)
    return substitute_entites(text)