def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a DJVU stream into an OEB book via an intermediate HTML file.

    Text is extracted with the external ``djvutxt`` program when
    ``options.use_djvutxt`` is set and ``/usr/bin/djvutxt`` exists; if that
    path is not taken or fails, the pure python ``DJVUFile`` reader is used.
    The text is converted to HTML with ``convert_basic``, written to a
    temporary ``index*.html`` file and passed to the HTML input plugin.
    Metadata read from ``stream`` is merged into the resulting book.

    :param stream: Readable, seekable file object holding the DJVU data.
    :param options: Conversion options. They are overwritten with the HTML
        input plugin's recommended values and ``input_encoding = 'utf-8'``.
    :param file_ext: Extension of the input; forwarded to the metadata reader.
    :param log: Logger forwarded to the HTML plugin and metadata transform.
    :param accelerators: Not used by this implementation.
    :return: The OEB book returned by the HTML input plugin.
    """
    from calibre.ebooks.txt.processor import convert_basic

    stdout = StringIO()
    ppdjvu = True
    # using djvutxt is MUCH faster, should make it an option
    if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
        from calibre.ptempfile import PersistentTemporaryFile
        filename = None
        try:
            fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
            filename = fp._name
            fp.write(stream.read())
            fp.close()
            cmd = ['djvutxt', filename]
            stdout.write(
                Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
            ppdjvu = False
        except Exception:
            # Best effort only: fall back to the pure python converter.
            stream.seek(0)
        finally:
            # Remove the temporary copy whether or not djvutxt succeeded, and
            # never let a cleanup failure trigger a second extraction into
            # the same buffer.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    log.debug('Failed to remove temporary file %s' % filename)

    if ppdjvu:
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)

    html = convert_basic(
        stdout.getvalue().replace(b"\n", b' ').replace(b'\037', b'\n\n'))

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        # Keep the candidate inside ``base`` so the existence check and the
        # write below refer to the same directory as the first attempt.
        fname = os.path.join(base, 'index%d.html' % c)
    with open(fname, 'wb') as htmlfile:
        htmlfile.write(html.encode('utf-8'))

    # The nested HTML conversion runs with debug_pipeline switched off; the
    # caller's value is put back afterwards.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(fname, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(fname)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    return oeb
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Assemble metadata for ``stream`` from its name, contents and OPF.

    The file name is parsed first (with ``pattern`` and a built-in fallback
    pattern), missing title/authors are filled with a translated "Unknown",
    and the result is then updated with metadata read from the file (when
    ``force_read_metadata`` or ``prefs['read_file_metadata']`` is true) and
    finally with a readable ``.opf`` file sharing the stream's base name.

    :param stream: File-like object; its ``name`` attribute is used if present.
    :param stream_type: File type/extension, or a false value.
    :param use_libprs_metadata: If true and the OPF has a non-None
        ``application_id``, the OPF metadata is returned unchanged.
    :param force_read_metadata: Read metadata from the file regardless of prefs.
    :param pattern: Pattern forwarded to ``metadata_from_filename`` as ``pat``.
    :return: The merged metadata object (or the OPF metadata, see above).
    """
    if stream_type:
        stream_type = stream_type.lower()
        # Collapse related extensions onto a single canonical type name.
        # NOTE(review): 'htm' is not among the html aliases - confirm intended.
        alias_groups = (
            ('html', ('html', 'xhtml', 'xhtm', 'xml')),
            ('mobi', ('mobi', 'prc', 'azw')),
            ('odt', ('odt', 'ods', 'odp', 'odg', 'odf')),
        )
        for canonical, aliases in alias_groups:
            if stream_type in aliases:
                stream_type = canonical
                break

    # Look for an OPF file next to the stream (same path, '.opf' extension).
    opf = None
    if hasattr(stream, 'name'):
        sidecar = os.path.splitext(stream.name)[0] + '.opf'
        if os.access(sidecar, os.R_OK):
            opf = opf_metadata(os.path.abspath(sidecar))

    if use_libprs_metadata:
        if getattr(opf, 'application_id', None) is not None:
            return opf

    filename = os.path.basename(getattr(stream, 'name', ''))
    # The fallback pattern matches the default filename format produced by calibre
    default_name_pat = re.compile(r'^(?P<title>.+) - (?P<author>[^-]+)$')
    merged = metadata_from_filename(
        filename, pat=pattern, fallback_pat=default_name_pat)
    if not merged.authors:
        merged.authors = [_('Unknown')]
    if not merged.title:
        merged.title = _('Unknown')

    file_mi = MetaInformation(None, None)
    if force_read_metadata or prefs['read_file_metadata']:
        file_mi = get_file_type_metadata(stream, stream_type)
    merged.smart_update(file_mi)

    # The OPF is applied last, after the file's own metadata.
    if opf is not None:
        merged.smart_update(opf)
    return merged
def convert(self, stream, options, file_ext, log, accelerators):
    """Turn a DJVU stream into an OEB book by way of a temporary HTML file.

    When ``options.use_djvutxt`` is set and ``/usr/bin/djvutxt`` exists, the
    stream is copied to a temporary file and the external ``djvutxt`` tool
    extracts the text. Otherwise, or if that attempt raises, the pure python
    ``DJVUFile`` reader is used. The text is converted with
    ``convert_basic``, saved as ``index*.html`` and fed to the HTML input
    plugin; metadata read from ``stream`` is then applied to the result.

    :param stream: Readable, seekable file object with the DJVU data.
    :param options: Conversion options; overwritten with the HTML input
        plugin's recommended values and ``input_encoding = 'utf-8'``.
    :param file_ext: Input extension, forwarded to the metadata reader.
    :param log: Logger passed on to the HTML plugin and metadata transform.
    :param accelerators: Not used by this implementation.
    :return: The OEB book produced by the HTML input plugin.
    """
    from calibre.ebooks.txt.processor import convert_basic

    stdout = StringIO()
    ppdjvu = True
    # using djvutxt is MUCH faster, should make it an option
    if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
        from calibre.ptempfile import PersistentTemporaryFile
        filename = None
        try:
            fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
            filename = fp._name
            fp.write(stream.read())
            fp.close()
            cmd = ['djvutxt', filename]
            stdout.write(Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
            ppdjvu = False
        except Exception:
            # Deliberately best effort: retry with the pure python converter.
            stream.seek(0)
        finally:
            # The temporary copy must go away on both the success and the
            # failure path; a failed removal is only logged so that it cannot
            # cause the text to be extracted twice into ``stdout``.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    log.debug('Failed to remove temporary file %s' % filename)

    if ppdjvu:
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)

    html = convert_basic(
        stdout.getvalue().replace(b"\n", b' ').replace(b'\037', b'\n\n'))

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        # Numbered candidates stay in ``base``; a bare file name would be
        # checked and written relative to the current directory instead.
        fname = os.path.join(base, 'index%d.html' % c)
    with open(fname, 'wb') as htmlfile:
        htmlfile.write(html.encode('utf-8'))

    # debug_pipeline is disabled for the nested HTML conversion and restored
    # afterwards.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(fname, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(fname)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    return oeb
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Combine file-name, in-file and OPF metadata for ``stream``.

    Metadata is first guessed from the file name using ``pattern``. If that
    yields nothing beyond the bare file name as title, the name is parsed
    again with a "title - author" pattern and underscores are turned into
    spaces. Missing title/authors become a translated "Unknown". The result
    is updated with metadata read from the file (when ``force_read_metadata``
    or ``prefs['read_file_metadata']`` is true) and then with a readable
    ``.opf`` file that shares the stream's base name.

    :param stream: File-like object; its ``name`` attribute is used if present.
    :param stream_type: File type/extension, or a false value.
    :param use_libprs_metadata: If true and the OPF has a non-None
        ``application_id``, the OPF metadata is returned as is.
    :param force_read_metadata: Read metadata from the file regardless of prefs.
    :param pattern: Pattern forwarded to ``metadata_from_filename`` as ``pat``.
    :return: The merged metadata object (or the OPF metadata, see above).
    """
    if stream_type:
        stream_type = stream_type.lower()
        # Collapse related extensions onto a single canonical type name.
        # NOTE(review): 'htm' is not among the html aliases - confirm intended.
        alias_groups = (
            ('html', ('html', 'xhtml', 'xhtm', 'xml')),
            ('mobi', ('mobi', 'prc', 'azw')),
            ('odt', ('odt', 'ods', 'odp', 'odg', 'odf')),
        )
        for canonical, aliases in alias_groups:
            if stream_type in aliases:
                stream_type = canonical
                break

    # Look for an OPF file next to the stream (same path, '.opf' extension).
    opf = None
    if hasattr(stream, 'name'):
        sidecar = os.path.splitext(stream.name)[0] + '.opf'
        if os.access(sidecar, os.R_OK):
            opf = opf_metadata(os.path.abspath(sidecar))

    if use_libprs_metadata:
        if getattr(opf, 'application_id', None) is not None:
            return opf

    file_mi = MetaInformation(None, None)
    filename = os.path.basename(getattr(stream, 'name', ''))
    merged = metadata_from_filename(filename, pat=pattern)
    if force_read_metadata or prefs['read_file_metadata']:
        file_mi = get_file_type_metadata(stream, stream_type)

    stem = os.path.splitext(filename)[0]
    nothing_parsed = (merged.title == stem and merged.is_null('authors')
                      and merged.is_null('isbn'))
    if nothing_parsed:
        # Assume that there was no metadata in the file and that the user
        # supplied pattern for matching meta info in the file name did not
        # match. The regex is meant to match the standard format file names
        # are written in by the library: title - author.extension
        library_pat = re.compile(r'^(?P<title>.+)[ _]-[ _](?P<author>[^-]+)$')
        reparsed = metadata_from_filename(filename, library_pat)
        merged.smart_update(reparsed)
        if merged.title:
            merged.title = merged.title.replace('_', ' ')
        if merged.authors:
            tidy_authors = []
            for author in merged.authors:
                tidy_authors.append(author.replace('_', ' ').strip())
            merged.authors = tidy_authors

    if not merged.authors:
        merged.authors = [_('Unknown')]
    if not merged.title:
        merged.title = _('Unknown')

    merged.smart_update(file_mi)
    # The OPF is applied last, after the file's own metadata.
    if opf is not None:
        merged.smart_update(opf)
    return merged
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Return metadata for ``stream`` merged from several sources.

    Sources, in the order they are applied: the file name (parsed with
    ``pattern``, and re-parsed with a "title - author" pattern when the first
    parse produced only the bare file name as title), translated "Unknown"
    placeholders for a missing title/authors, metadata read from the file
    itself (only when ``force_read_metadata`` or
    ``prefs['read_file_metadata']`` is true), and a readable ``.opf`` file
    with the same base name as the stream.

    :param stream: File-like object; its ``name`` attribute is used if present.
    :param stream_type: File type/extension, or a false value.
    :param use_libprs_metadata: If true and the OPF has a non-None
        ``application_id``, the OPF metadata is returned directly.
    :param force_read_metadata: Read metadata from the file regardless of prefs.
    :param pattern: Pattern forwarded to ``metadata_from_filename`` as ``pat``.
    :return: The merged metadata object (or the OPF metadata, see above).
    """
    if stream_type:
        stream_type = stream_type.lower()
        # Map each family of extensions to one canonical type name.
        # NOTE(review): 'htm' is not among the html aliases - confirm intended.
        canonical_for = {}
        for alias in ('html', 'xhtml', 'xhtm', 'xml'):
            canonical_for[alias] = 'html'
        for alias in ('mobi', 'prc', 'azw'):
            canonical_for[alias] = 'mobi'
        for alias in ('odt', 'ods', 'odp', 'odg', 'odf'):
            canonical_for[alias] = 'odt'
        stream_type = canonical_for.get(stream_type, stream_type)

    # An OPF file beside the stream (same path, '.opf' extension) is read
    # when it is accessible.
    opf = None
    if hasattr(stream, 'name'):
        opf_path = os.path.splitext(stream.name)[0] + '.opf'
        if os.access(opf_path, os.R_OK):
            opf = opf_metadata(os.path.abspath(opf_path))

    opf_app_id_wanted = use_libprs_metadata and \
        getattr(opf, 'application_id', None) is not None
    if opf_app_id_wanted:
        return opf

    from_file = MetaInformation(None, None)
    leaf = os.path.basename(getattr(stream, 'name', ''))
    result = metadata_from_filename(leaf, pat=pattern)
    if force_read_metadata or prefs['read_file_metadata']:
        from_file = get_file_type_metadata(stream, stream_type)

    if (result.title == os.path.splitext(leaf)[0]
            and result.is_null('authors') and result.is_null('isbn')):
        # Assume that there was no metadata in the file and that the user
        # supplied pattern for matching meta info in the file name did not
        # match. The regex is meant to match the standard format file names
        # are written in by the library: title - author.extension
        standard_pat = re.compile(r'^(?P<title>.+)[ _]-[ _](?P<author>[^-]+)$')
        result.smart_update(metadata_from_filename(leaf, standard_pat))
        if result.title:
            result.title = result.title.replace('_', ' ')
        if result.authors:
            result.authors = [
                who.replace('_', ' ').strip() for who in result.authors]

    if not result.authors:
        result.authors = [_('Unknown')]
    if not result.title:
        result.title = _('Unknown')

    result.smart_update(from_file)
    # The OPF is applied last, after the file's own metadata.
    if opf is not None:
        result.smart_update(opf)
    return result
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a DJVU stream into an OEB book via an intermediate HTML file.

    The text layer is read with ``DJVUFile.get_text``, converted to HTML by
    ``convert_basic``, written to an unused ``index*.html`` name in the
    current directory and passed to the HTML input plugin. Metadata read
    from ``stream`` is then applied to the resulting book.

    :param stream: File object holding the DJVU data.
    :param options: Conversion options; overwritten with the HTML input
        plugin's recommended values and ``input_encoding = 'utf-8'``.
    :param file_ext: Input extension, forwarded to the metadata reader.
    :param log: Logger passed on to the HTML plugin and metadata transform.
    :param accelerators: Not used by this implementation.
    :return: The OEB book produced by the HTML input plugin.
    :raises ValueError: If no text could be extracted from the file.
    """
    from calibre.ebooks.txt.processor import convert_basic

    text_buf = BytesIO()
    from calibre.ebooks.djvu.djvu import DJVUFile
    DJVUFile(stream).get_text(text_buf)
    extracted = text_buf.getvalue()
    if not extracted:
        raise ValueError(
            'The DJVU file contains no text, only images, probably page scans.'
            ' calibre only supports conversion of DJVU files with actual text in them.')

    # Newlines become spaces and the \037 byte becomes a blank line before
    # the text is handed to the basic text-to-HTML converter.
    flattened = extracted.replace(b"\n", b' ')
    paragraphed = flattened.replace(b'\037', b'\n\n')
    html = convert_basic(paragraphed)

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for plugin_opt in html_input.options:
        setattr(options, plugin_opt.option.name, plugin_opt.recommended_value)
    options.input_encoding = 'utf-8'

    # Pick the first of index.html, index1.html, index2.html, ... that does
    # not exist yet in the current directory.
    workdir = os.getcwd()
    attempt = 0
    while True:
        leaf = 'index%d.html' % attempt if attempt else 'index.html'
        htmlfile = os.path.join(workdir, leaf)
        if not os.path.exists(htmlfile):
            break
        attempt += 1

    with open(htmlfile, 'wb') as out:
        out.write(html.encode('utf-8'))

    # debug_pipeline is disabled for the nested HTML conversion and restored
    # afterwards.
    saved_debug_pipeline = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as src:
        oeb = html_input.convert(src, options, 'html', log, {})
    options.debug_pipeline = saved_debug_pipeline
    os.remove(htmlfile)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(
        get_file_type_metadata(stream, file_ext), oeb.metadata, log)
    return oeb
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Assemble metadata for ``stream`` from its name, contents and OPF.

    The file name is parsed first (with ``pattern`` and a built-in fallback
    pattern), missing title/authors are filled with a translated "Unknown",
    and the result is then updated with metadata read from the file (when
    ``force_read_metadata`` or ``prefs['read_file_metadata']`` is true) and
    finally with a readable ``.opf`` file sharing the stream's base name.

    :param stream: File-like object; its ``name`` attribute is used if present.
    :param stream_type: File type/extension, or a false value.
    :param use_libprs_metadata: If true and the OPF has a non-None
        ``application_id``, the OPF metadata is returned unchanged.
    :param force_read_metadata: Read metadata from the file regardless of prefs.
    :param pattern: Pattern forwarded to ``metadata_from_filename`` as ``pat``.
    :return: The merged metadata object (or the OPF metadata, see above).
    """
    if stream_type:
        stream_type = stream_type.lower()
        # Collapse related extensions onto a single canonical type name.
        # NOTE(review): 'htm' is not among the html aliases - confirm intended.
        alias_groups = (
            ('html', ('html', 'xhtml', 'xhtm', 'xml')),
            ('mobi', ('mobi', 'prc', 'azw')),
            ('odt', ('odt', 'ods', 'odp', 'odg', 'odf')),
        )
        for canonical, aliases in alias_groups:
            if stream_type in aliases:
                stream_type = canonical
                break

    # Look for an OPF file next to the stream (same path, '.opf' extension).
    opf = None
    if hasattr(stream, 'name'):
        sidecar = os.path.splitext(stream.name)[0] + '.opf'
        if os.access(sidecar, os.R_OK):
            opf = opf_metadata(os.path.abspath(sidecar))

    if use_libprs_metadata:
        if getattr(opf, 'application_id', None) is not None:
            return opf

    filename = os.path.basename(getattr(stream, 'name', ''))
    # The fallback pattern matches the default filename format produced by calibre
    pat_flags = regex.UNICODE | regex.VERSION1 | regex.FULLCASE
    default_name_pat = regex.compile(
        r'^(?P<title>.+) - (?P<author>[^-]+)$', flags=pat_flags)
    merged = metadata_from_filename(
        filename, pat=pattern, fallback_pat=default_name_pat)
    if not merged.authors:
        merged.authors = [_('Unknown')]
    if not merged.title:
        merged.title = _('Unknown')

    file_mi = MetaInformation(None, None)
    if force_read_metadata or prefs['read_file_metadata']:
        file_mi = get_file_type_metadata(stream, stream_type)
    merged.smart_update(file_mi)

    # The OPF is applied last, after the file's own metadata.
    if opf is not None:
        merged.smart_update(opf)
    return merged
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a DJVU stream into an OEB book via an intermediate HTML file.

    The text layer is read with ``DJVUFile.get_text``, converted to HTML by
    ``convert_basic``, written to an unused ``index*.html`` name in the
    directory returned by ``getcwd()`` and passed to the HTML input plugin.
    Metadata read from ``stream`` is then applied to the resulting book.

    :param stream: File object holding the DJVU data.
    :param options: Conversion options; overwritten with the HTML input
        plugin's recommended values and ``input_encoding = 'utf-8'``.
    :param file_ext: Input extension, forwarded to the metadata reader.
    :param log: Logger passed on to the HTML plugin and metadata transform.
    :param accelerators: Not used by this implementation.
    :return: The OEB book produced by the HTML input plugin.
    """
    from calibre.ebooks.txt.processor import convert_basic

    text_buf = BytesIO()
    from calibre.ebooks.djvu.djvu import DJVUFile
    DJVUFile(stream).get_text(text_buf)

    # Newlines become spaces and the \037 byte becomes a blank line before
    # the text is handed to the basic text-to-HTML converter.
    flattened = text_buf.getvalue().replace(b"\n", b' ')
    html = convert_basic(flattened.replace(b'\037', b'\n\n'))

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for plugin_opt in html_input.options:
        setattr(options, plugin_opt.option.name, plugin_opt.recommended_value)
    options.input_encoding = 'utf-8'

    # Pick the first of index.html, index1.html, index2.html, ... that does
    # not exist yet.
    workdir = getcwd()
    attempt = 0
    while True:
        leaf = 'index%d.html' % attempt if attempt else 'index.html'
        html_path = os.path.join(workdir, leaf)
        if not os.path.exists(html_path):
            break
        attempt += 1

    with open(html_path, 'wb') as out:
        out.write(html.encode('utf-8'))

    # debug_pipeline is disabled for the nested HTML conversion and restored
    # afterwards.
    saved_debug_pipeline = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(html_path, 'rb') as src:
        oeb = html_input.convert(src, options, 'html', log, {})
    options.debug_pipeline = saved_debug_pipeline
    os.remove(html_path)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(
        get_file_type_metadata(stream, file_ext), oeb.metadata, log)
    return oeb
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Return metadata for ``stream`` merged from several sources.

    Sources, in the order they are applied: the file name (parsed with
    ``pattern`` and a built-in fallback pattern), translated "Unknown"
    placeholders for a missing title/authors, metadata read from the file
    itself (only when ``force_read_metadata`` or
    ``prefs['read_file_metadata']`` is true), and a readable ``.opf`` file
    with the same base name as the stream.

    :param stream: File-like object; its ``name`` attribute is used if present.
    :param stream_type: File type/extension, or a false value.
    :param use_libprs_metadata: If true and the OPF has a non-None
        ``application_id``, the OPF metadata is returned directly.
    :param force_read_metadata: Read metadata from the file regardless of prefs.
    :param pattern: Pattern forwarded to ``metadata_from_filename`` as ``pat``.
    :return: The merged metadata object (or the OPF metadata, see above).
    """
    if stream_type:
        stream_type = stream_type.lower()
        # Map each family of extensions to one canonical type name.
        # NOTE(review): 'htm' is not among the html aliases - confirm intended.
        canonical_for = {}
        for alias in ('html', 'xhtml', 'xhtm', 'xml'):
            canonical_for[alias] = 'html'
        for alias in ('mobi', 'prc', 'azw'):
            canonical_for[alias] = 'mobi'
        for alias in ('odt', 'ods', 'odp', 'odg', 'odf'):
            canonical_for[alias] = 'odt'
        stream_type = canonical_for.get(stream_type, stream_type)

    # An OPF file beside the stream (same path, '.opf' extension) is read
    # when it is accessible.
    opf = None
    if hasattr(stream, 'name'):
        opf_path = os.path.splitext(stream.name)[0] + '.opf'
        if os.access(opf_path, os.R_OK):
            opf = opf_metadata(os.path.abspath(opf_path))

    opf_app_id_wanted = use_libprs_metadata and \
        getattr(opf, 'application_id', None) is not None
    if opf_app_id_wanted:
        return opf

    leaf = os.path.basename(getattr(stream, 'name', ''))
    # The fallback pattern matches the default filename format produced by calibre
    calibre_name_pat = re.compile(r'^(?P<title>.+) - (?P<author>[^-]+)$')
    result = metadata_from_filename(
        leaf, pat=pattern, fallback_pat=calibre_name_pat)
    if not result.authors:
        result.authors = [_('Unknown')]
    if not result.title:
        result.title = _('Unknown')

    from_file = MetaInformation(None, None)
    if force_read_metadata or prefs['read_file_metadata']:
        from_file = get_file_type_metadata(stream, stream_type)
    result.smart_update(from_file)

    # The OPF is applied last, after the file's own metadata.
    if opf is not None:
        result.smart_update(opf)
    return result
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a TXT, TXTZ, Markdown or Textile stream into an OEB book.

    Steps: read the raw text (from every ``.txt``/``.text`` file of a TXTZ
    archive, or directly from ``stream``), decode it, normalise entities and
    line endings, detect or apply the paragraph and formatting types,
    convert the text to HTML with the matching processor, and finally run
    that HTML through the HTML input plugin. Metadata comes from the
    Markdown front matter when the Markdown converter returned any,
    otherwise from ``get_file_type_metadata``.

    Side effects: sets ``self.log`` and ``self.html_postprocess_title``,
    mutates ``options`` (formatting/paragraph type, heuristics flags, HTML
    plugin recommendations, ``input_encoding``) and, for TXTZ, extracts the
    archive into the current directory.

    :param stream: Readable file object with the input data.
    :param options: Conversion options (mutated, see above).
    :param file_ext: Input extension; ``'txtz'``, ``'md'``, ``'markdown'``
        and ``'textile'`` receive special treatment.
    :param log: Logger used for progress/debug messages.
    :param accelerators: Not used by this implementation.
    :return: The OEB book produced by the HTML input plugin.
    :raises ValueError: If Markdown conversion fails with ``RuntimeError``.
    """
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line, separate_paragraphs_print_formatted,
        preserve_spaces, detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = ''
    log.debug('Reading text from file...')
    length = 0

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')
        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    txt += tf.read() + '\n\n'
    else:
        txt = stream.read()
        if file_ext in {'md', 'textile', 'markdown'}:
            options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
            log.info('File extension indicates particular formatting. '
                     'Forcing formatting type to: %s' % options.formatting_type)
            options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt)
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' %
                  (ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)

    # Remove BOM from start of txt as its presence can confuse markdown.
    # The 4-byte UTF-32 marks are tested first: BOM_UTF16_LE is a prefix of
    # BOM_UTF32_LE, so testing UTF-16 first would strip only half of a
    # UTF-32-LE mark and leave two stray NUL bytes in the text.
    import codecs
    for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        options.enable_heuristics = True
        options.unwrap_lines = False
        options.smarten_punctuation = True

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        # Reuse the line length computed for 'unformatted' text, if any.
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # User requested preservation of runs of spaces.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    html = ''
    input_mi = None
    if options.formatting_type == 'markdown':
        log.debug('Running text through markdown conversion...')
        try:
            input_mi, html = convert_markdown_with_metadata(
                txt,
                extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
        except RuntimeError:
            raise ValueError('This txt file has malformed markup, it cannot be'
                             ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
    elif options.formatting_type == 'textile':
        log.debug('Running text through textile conversion...')
        html = convert_textile(txt)
    else:
        log.debug('Running text through basic conversion...')
        flow_size = getattr(options, 'flow_size', 0)
        html = convert_basic(txt, epub_split_size_kb=flow_size)

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        # Numbered candidates stay in ``base``; a bare file name would be
        # checked and written relative to the current directory instead.
        fname = os.path.join(base, 'index%d.html' % c)
    with open(fname, 'wb') as htmlfile:
        htmlfile.write(html.encode('utf-8'))

    # debug_pipeline is disabled for the nested HTML conversion and restored
    # afterwards.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion. The file is opened in a ``with``
    # block so the handle is closed before the file is removed.
    with open(fname, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(fname)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an HTMLZ archive into an OEB book.

    The archive is extracted into the current directory. One top-level HTML
    file is selected (a file named ``index.html``/``index.xhtml``/
    ``index.htm`` if present, otherwise the first top-level HTML file),
    decoded, re-encoded as UTF-8 into a temporary ``index*.html`` file and
    passed to the HTML input plugin. Metadata read from ``stream`` is
    applied, and if a top-level ``.opf`` file names a cover, that image is
    added to the manifest and guide.

    :param stream: File object holding the HTMLZ (zip) data.
    :param options: Conversion options; overwritten with the HTML input
        plugin's recommended values and ``input_encoding = 'utf-8'``.
    :param file_ext: Input extension, forwarded to the metadata reader.
    :param log: Logger; also stored on ``self.log``.
    :param accelerators: Not used by this implementation.
    :return: The OEB book produced by the HTML input plugin.
    :raises Exception: If there is no top-level HTML file or it is empty.
    """
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.zipfile import ZipFile

    self.log = log
    html = u''

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    top_levels = [x for x in os.listdir(u'.') if os.path.isfile(x)]
    # Try to find an index. file.
    for x in top_levels:
        if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            elif x != index:
                # Only a *different* HTML file counts; the chosen index file
                # itself must not trigger the warning below.
                multiple_html = True
    # Warn the user if there are multiple HTML files in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception(_('No top level HTML file found.'))

    if not html:
        raise Exception(_('Top level HTML file %s is empty') % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    base = getcwd()
    fname = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = os.path.join(base, u'index%d.html' % c)
    with open(fname, 'wb') as htmlfile:
        htmlfile.write(html.encode('utf-8'))

    # debug_pipeline is disabled for the nested HTML conversion and restored
    # afterwards.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion. The file is opened in a ``with``
    # block so the handle is closed before the file is removed.
    with open(fname, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(fname)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=getcwd())
        cover_path = opf.raster_cover or opf.cover

    # Set the cover.
    if cover_path:
        with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        cover_id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(cover_id, href, guess_type(cover_name)[0], data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an HTMLZ archive into an OEB book.

    The archive is extracted into the current directory. One top-level HTML
    file is selected (a file named ``index.html``/``index.xhtml``/
    ``index.htm`` if present, otherwise the first top-level HTML file),
    decoded, re-encoded as UTF-8 into a temporary ``index*.html`` file and
    passed to the HTML input plugin. Metadata read from ``stream`` is
    applied, and if a top-level ``.opf`` file names a cover, that image is
    added to the manifest and guide.

    :param stream: File object holding the HTMLZ (zip) data.
    :param options: Conversion options; overwritten with the HTML input
        plugin's recommended values and ``input_encoding = 'utf-8'``.
    :param file_ext: Input extension, forwarded to the metadata reader.
    :param log: Logger; also stored on ``self.log``.
    :param accelerators: Not used by this implementation.
    :return: The OEB book produced by the HTML input plugin.
    :raises Exception: If there is no top-level HTML file or it is empty.
    """
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.zipfile import ZipFile

    self.log = log
    html = u''

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    top_levels = [x for x in os.listdir(u'.') if os.path.isfile(x)]
    # Try to find an index. file.
    for x in top_levels:
        if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            elif x != index:
                # Only a *different* HTML file counts; the chosen index file
                # itself must not trigger the warning below.
                multiple_html = True
    # Warn the user if there are multiple HTML files in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception(_('No top level HTML file found.'))

    if not html:
        raise Exception(_('Top level HTML file %s is empty') % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    base = os.getcwdu()
    fname = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = os.path.join(base, u'index%d.html' % c)
    with open(fname, 'wb') as htmlfile:
        htmlfile.write(html.encode('utf-8'))

    # debug_pipeline is disabled for the nested HTML conversion and restored
    # afterwards.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion. The file is opened in a ``with``
    # block so the handle is closed before the file is removed.
    with open(fname, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(fname)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=os.getcwdu())
        cover_path = opf.raster_cover or opf.cover

    # Set the cover.
    if cover_path:
        with open(os.path.join(os.getcwdu(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        cover_id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(cover_id, href, guess_type(cover_name)[0], data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a TXT, TXTZ, Markdown or Textile stream into an OEB book.

    Steps: read the raw text (from every ``.txt``/``.text`` file of a TXTZ
    archive, or directly from ``stream``), decode it (the encoding is
    detected from the first 4096 bytes unless ``options.input_encoding`` is
    set), normalise entities and line endings, detect or apply the paragraph
    and formatting types, convert the text to HTML with the matching
    processor, and finally run that HTML through the HTML input plugin.
    Metadata comes from the Markdown front matter when the Markdown
    converter returned any, otherwise from ``get_file_type_metadata``.

    Side effects: sets ``self.log`` and ``self.html_postprocess_title``,
    mutates ``options`` (formatting/paragraph type, heuristics flags, HTML
    plugin recommendations, ``input_encoding``) and, for TXTZ, extracts the
    archive into the current directory.

    :param stream: Readable file object with the input data.
    :param options: Conversion options (mutated, see above).
    :param file_ext: Input extension; ``'txtz'``, ``'md'``, ``'markdown'``
        and ``'textile'`` receive special treatment.
    :param log: Logger used for progress/debug messages.
    :param accelerators: Not used by this implementation.
    :return: The OEB book produced by the HTML input plugin.
    :raises ValueError: If Markdown conversion fails with ``RuntimeError``.
    """
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line, separate_paragraphs_print_formatted,
        preserve_spaces, detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = ''
    log.debug('Reading text from file...')
    length = 0

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')
        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    txt += tf.read() + '\n\n'
    else:
        txt = stream.read()
        if file_ext in {'md', 'textile', 'markdown'}:
            options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
            log.info('File extension indicates particular formatting. '
                     'Forcing formatting type to: %s' % options.formatting_type)
            options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' %
                  (ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)

    # Remove BOM from start of txt as its presence can confuse markdown.
    # The 4-byte UTF-32 marks are tested first: BOM_UTF16_LE is a prefix of
    # BOM_UTF32_LE, so testing UTF-16 first would strip only half of a
    # UTF-32-LE mark and leave two stray NUL bytes in the text.
    import codecs
    for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        options.enable_heuristics = True
        options.unwrap_lines = False
        options.smarten_punctuation = True

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        # Reuse the line length computed for 'unformatted' text, if any.
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # User requested preservation of runs of spaces.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    html = ''
    input_mi = None
    if options.formatting_type == 'markdown':
        log.debug('Running text through markdown conversion...')
        try:
            input_mi, html = convert_markdown_with_metadata(
                txt,
                extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
        except RuntimeError:
            raise ValueError('This txt file has malformed markup, it cannot be'
                             ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
    elif options.formatting_type == 'textile':
        log.debug('Running text through textile conversion...')
        html = convert_textile(txt)
    else:
        log.debug('Running text through basic conversion...')
        flow_size = getattr(options, 'flow_size', 0)
        html = convert_basic(txt, epub_split_size_kb=flow_size)

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        # Numbered candidates stay in ``base``; a bare file name would be
        # checked and written relative to the current directory instead.
        fname = os.path.join(base, 'index%d.html' % c)
    with open(fname, 'wb') as htmlfile:
        htmlfile.write(html.encode('utf-8'))

    # debug_pipeline is disabled for the nested HTML conversion and restored
    # afterwards.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion. The file is opened in a ``with``
    # block so the handle is closed before the file is removed.
    with open(fname, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(fname)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb