def smarten_punctuation(self):
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.get_html_names():
        self.log.info("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = re.sub(ur'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html,
                      flags=re.UNICODE | re.MULTILINE)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, '---', ' – ')
        html = string.replace(html, u"\x97", ' – ')
        html = string.replace(html, '--', ' — ')
        html = string.replace(html, u"\u2014", ' — ')
        html = string.replace(html, u"\u2013", ' – ')
        # Fix comment nodes that got mangled
        html = string.replace(html, u'<! — ', u'<!-- ')
        html = string.replace(html, u' — >', u' -->')
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.dirty(name)
    self.flush_cache()

def clean_markup(self):
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.get_html_names():
        self.log.debug("Cleaning markup for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Replace unicode dashes with ASCII representations - smarten
        # punctuation picks these up if asked for
        html = string.replace(html, u"\u2014", ' -- ')
        html = string.replace(html, u"\u2013", ' --- ')
        html = string.replace(html, u"\x97", ' --- ')
        # Get rid of Microsoft cruft
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        html = re.sub(r'(?i)</?st1:\w+>', '', html)
        # Re-open self-closing paragraph tags
        html = re.sub(r'<p[^>/]*/>', '<p></p>', html)
        # Remove empty headings
        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.set(name, html)

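# The dash handling in clean_markup() above is the inverse of the mapping used by the
# smarten_punctuation() variants: unicode dashes are flattened to ASCII double/triple
# dashes so that a later smarten pass can turn them back into proper dashes. A tiny
# self-contained sketch of that round trip (the helper names below are illustrative
# only, not calibre APIs):
def _flatten_dashes(text):
    # mirrors clean_markup: em-dash -> ' -- ', en-dash -> ' --- '
    return text.replace(u"\u2014", " -- ").replace(u"\u2013", " --- ")

def _smarten_dashes(text):
    # mirrors smarten_punctuation: '---' -> en-dash, then '--' -> em-dash
    # (longest match replaced first, as in the code above)
    return text.replace("---", u" \u2013 ").replace("--", u" \u2014 ")

assert u"\u2014" in _smarten_dashes(_flatten_dashes(u"wait\u2014no"))
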
def smarten_punctuation(self):
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.get_html_names():
        self.log.debug("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, '---', ' – ')
        html = string.replace(html, u"\x97", ' – ')
        html = string.replace(html, '--', ' — ')
        html = string.replace(html, u"\u2014", ' — ')
        html = string.replace(html, u"\u2013", ' – ')
        html = string.replace(html, u"...", "…")
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.set(name, html)

def __smarten_punctuation_impl(self, name):
    # type: (str) -> None
    """Convert standard punctuation to "smart" punctuation."""
    preprocessor = HeuristicProcessor(log=self.log)
    self.log.debug("Smartening punctuation for file {0}".format(name))
    html = self.raw_data(name, decode=True, normalize_to_nfc=True)
    if html is None:
        raise Exception(
            _(  # noqa: F821 - _ is defined in globals by calibre
                "No HTML content in file {0}").format(name))
    # Fix non-breaking space indents
    html = preprocessor.fix_nbsp_indents(html)
    # Smarten punctuation
    html = smartyPants(html)
    # Ellipsis to HTML entity
    html = ELLIPSIS_RE.sub("…", html)
    # Double-dash and unicode char code to em-dash
    html = html.replace("---", " – ")
    html = html.replace("\x97", " – ")
    html = html.replace("\u2013", " – ")
    html = html.replace("--", " — ")
    html = html.replace("\u2014", " — ")
    # Fix comment nodes that got mangled
    html = html.replace("<! — ", "<!-- ")
    html = html.replace(" — >", " -->")
    self.replace(name, self.parse_xhtml(html))
    self.flush_cache()

def smarten_punctuation(self):
    """Convert standard punctuation to "smart" punctuation."""
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.html_names():
        self.log.debug("Smartening punctuation for file {0}".format(name))
        html = self.raw_data(name, decode=True, normalize_to_nfc=True)
        if html is None:
            continue
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = ELLIPSIS_RE.sub("…", html)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, "---", " – ")
        html = string.replace(html, "\x97", " – ")
        html = string.replace(html, "\u2013", " – ")
        html = string.replace(html, "--", " — ")
        html = string.replace(html, "\u2014", " — ")
        # Fix comment nodes that got mangled
        html = string.replace(html, "<! — ", "<!-- ")
        html = string.replace(html, " — >", " -->")
        with self.open(name, "wb") as f:
            f.write(
                html.encode(self.encoding_map.get(name, self.used_encoding)))

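# ELLIPSIS_RE is used by the two variants above but is not defined in this excerpt.
# A minimal sketch of a plausible module-level definition, assuming it mirrors the
# inline ellipsis pattern used by the older variants; the exact constant in the real
# code base may differ:
import re

ELLIPSIS_RE = re.compile(r'(?u)(?<=\w)\s?(\.\s+?){2}\.')  # assumed: three spaced dots after a word character
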
def smarten_punctuation(html, log=None):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-' + str(uuid4())
    stop = 'calibre-smartypants-' + str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    return substitute_entites(html)

def smarten_punctuation(html, log):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-' + str(uuid4())
    stop = 'calibre-smartypants-' + str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)

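# A minimal usage sketch for the standalone helper above (only runnable inside a
# calibre environment, since the helper imports calibre modules). The sample markup
# and the SimpleLogger stand-in are assumptions for illustration; the conversion
# pipeline normally supplies calibre's own log object.
class SimpleLogger(object):  # hypothetical stand-in for calibre's Log
    def debug(self, *args):
        print(' '.join(str(a) for a in args))
    info = warn = error = debug

sample = '<p>"Hello" -- she said . . .</p><!-- keep this comment intact -->'
# The comment markers are swapped for uuid placeholders so smartyPants cannot mangle
# them, then restored before entity substitution.
print(smarten_punctuation(sample, SimpleLogger()))
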
def __call__(self, html, remove_special_chars=None, get_preprocess_html=False):
    if remove_special_chars is not None:
        html = remove_special_chars.sub('', html)
    html = html.replace('\0', '')
    is_pdftohtml = self.is_pdftohtml(html)
    if self.is_baen(html):
        rules = []
    elif self.is_book_designer(html):
        rules = self.BOOK_DESIGNER
    elif is_pdftohtml:
        rules = self.PDFTOHTML
    else:
        rules = []

    start_rules = []

    if not getattr(self.extra_opts, 'keep_ligatures', False):
        html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

    user_sr_rules = {}

    # Function for processing search and replace
    def do_search_replace(search_pattern, replace_txt):
        from calibre.ebooks.conversion.search_replace import compile_regular_expression
        try:
            search_re = compile_regular_expression(search_pattern)
            if not replace_txt:
                replace_txt = ''
            rules.insert(0, (search_re, replace_txt))
            user_sr_rules[(search_re, replace_txt)] = search_pattern
        except Exception as e:
            self.log.error('Failed to parse %r regexp because %s' % (
                search_pattern, as_unicode(e)))

    # search / replace using the sr?_search / sr?_replace options
    for i in range(1, 4):
        search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
        search_pattern = getattr(self.extra_opts, search, '')
        replace_txt = getattr(self.extra_opts, replace, '')
        if search_pattern:
            do_search_replace(search_pattern, replace_txt)

    # multi-search / replace using the search_replace option
    search_replace = getattr(self.extra_opts, 'search_replace', None)
    if search_replace:
        search_replace = json.loads(search_replace)
        for search_pattern, replace_txt in reversed(search_replace):
            do_search_replace(search_pattern, replace_txt)

    end_rules = []
    # delete soft hyphens - moved here so it's executed after header/footer removal
    if is_pdftohtml:
        # unwrap/delete soft hyphens
        end_rules.append((re.compile(u'[\xad](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
        # unwrap/delete soft hyphens with formatting
        end_rules.append((re.compile(u'[\xad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

    length = -1
    if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
        docanalysis = DocAnalysis('pdf', html)
        length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
        if length:
            # print "The pdf line length returned is " + str(length)
            # unwrap em/en dashes
            end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
            end_rules.append(
                # Un wrap using punctuation
                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
            )

    for rule in self.PREPROCESS + start_rules:
        html = rule[0].sub(rule[1], html)

    if self.regex_wizard_callback is not None:
        self.regex_wizard_callback(self.current_href, html)

    if get_preprocess_html:
        return html

    def dump(raw, where):
        import os
        dp = getattr(self.extra_opts, 'debug_pipeline', None)
        if dp and os.path.exists(dp):
            odir = os.path.join(dp, 'input')
            if os.path.exists(odir):
                odir = os.path.join(odir, where)
                if not os.path.exists(odir):
                    os.makedirs(odir)
                name, i = None, 0
                while not name or os.path.exists(os.path.join(odir, name)):
                    i += 1
                    name = '%04d.html' % i
                with open(os.path.join(odir, name), 'wb') as f:
                    f.write(raw.encode('utf-8'))

    # dump(html, 'pre-preprocess')

    for rule in rules + end_rules:
        try:
            html = rule[0].sub(rule[1], html)
        except Exception as e:
            if rule in user_sr_rules:
                self.log.error(
                    'User supplied search & replace rule: %s -> %s '
                    'failed with error: %s, ignoring.' % (
                        user_sr_rules[rule], rule[1], e))
            else:
                raise

    if is_pdftohtml and length > -1:
        # Dehyphenate
        dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
        html = dehyphenator(html, 'html', length)

    if is_pdftohtml:
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        pdf_markup = HeuristicProcessor(self.extra_opts, None)
        totalwords = 0
        if pdf_markup.get_word_count(html) > 7000:
            html = pdf_markup.markup_chapters(html, totalwords, True)

    # dump(html, 'post-preprocess')

    # Handle broken XHTML w/ SVG (ugh)
    if 'svg:' in html and SVG_NS not in html:
        html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
    if 'xlink:' in html and XLINK_NS not in html:
        html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

    html = XMLDECL_RE.sub('', html)

    if getattr(self.extra_opts, 'asciiize', False):
        from calibre.utils.localization import get_udc
        from calibre.utils.mreplace import MReplace
        unihandecoder = get_udc()
        mr = MReplace(data={u'«': u'<'*3, u'»': u'>'*3})
        html = mr.mreplace(html)
        html = unihandecoder.decode(html)

    if getattr(self.extra_opts, 'enable_heuristics', False):
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        preprocessor = HeuristicProcessor(self.extra_opts, self.log)
        html = preprocessor(html)

    if is_pdftohtml:
        html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

    if getattr(self.extra_opts, 'smarten_punctuation', False):
        html = smarten_punctuation(html, self.log)

    try:
        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
    except AttributeError:
        unsupported_unicode_chars = u''
    if unsupported_unicode_chars:
        from calibre.utils.localization import get_udc
        unihandecoder = get_udc()
        for char in unsupported_unicode_chars:
            asciichar = unihandecoder.decode(char)
            html = html.replace(char, asciichar)

    return html

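# The 'search_replace' option consumed by __call__ above is parsed with json.loads()
# and iterated in reverse as (search_pattern, replace_txt) pairs, so the expected
# value is a JSON array of two-element arrays. A small illustration of building such
# a value (the patterns themselves are made-up examples):
import json

search_replace_opt = json.dumps([
    [r'\bteh\b', 'the'],                     # fix a common typo
    [r'<div class="advert">.*?</div>', ''],  # drop an unwanted block
])
# getattr(self.extra_opts, 'search_replace', None) would return this string; each
# pattern is then compiled and inserted at the front of the rule list.
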
def convert(self, stream, options, file_ext, log, accelerators):
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line, separate_paragraphs_print_formatted,
        preserve_spaces, detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = ''
    log.debug('Reading text from file...')
    length = 0

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')

        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    txt += tf.read() + '\n\n'
    else:
        txt = stream.read()

    if file_ext in {'md', 'textile', 'markdown'}:
        options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
        log.info('File extension indicates particular formatting. '
                 'Forcing formatting type to: %s' % options.formatting_type)
        options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt)
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' % (
            ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)

    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break

    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the &nbsp; entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    html = ''
    input_mi = None
    if options.formatting_type == 'markdown':
        log.debug('Running text through markdown conversion...')
        try:
            input_mi, html = convert_markdown_with_metadata(
                txt,
                extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
        except RuntimeError:
            raise ValueError(
                'This txt file has malformed markup, it cannot be'
                ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
    elif options.formatting_type == 'textile':
        log.debug('Running text through textile conversion...')
        html = convert_textile(txt)
    else:
        log.debug('Running text through basic conversion...')
        flow_size = getattr(options, 'flow_size', 0)
        html = convert_basic(txt, epub_split_size_kb=flow_size)

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = 'index%d.html' % c
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb