class DOCXInput(InputFormatPlugin):
    """Input plugin that hands DOCX/DOCM streams to the DOCX-to-HTML converter."""

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = _('Convert DOCX files (.docx and .docm) to HTML')
    file_types = {'docx', 'docm'}

    options = {
        OptionRecommendation(
            name='docx_no_cover',
            recommended_value=False,
            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
                   'it will be removed from the document and used as the cover for created ebook. This option '
                   'turns off that behavior.'),
        ),
        OptionRecommendation(
            name='docx_no_pagebreaks_between_notes',
            recommended_value=False,
            help=_('Do not insert a page break after every endnote.'),
        ),
    }

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        """Build a ``Convert`` object for ``stream`` and return the result of calling it.

        Cover detection is enabled unless ``options.docx_no_cover`` is set;
        ``options.docx_no_pagebreaks_between_notes`` is forwarded as
        ``notes_nopb``. ``file_ext`` and ``accelerators`` are not used here.
        """
        from calibre.ebooks.docx.to_html import Convert

        want_cover = not options.docx_no_cover
        no_note_breaks = options.docx_no_pagebreaks_between_notes
        converter = Convert(stream, detect_cover=want_cover, log=log,
                            notes_nopb=no_note_breaks)
        return converter()
class PDBOutput(OutputFormatPlugin):
    """Output plugin that writes the book into a PDB container."""

    name = 'PDB Output'
    author = 'John Schember'
    file_type = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = set([
        OptionRecommendation(
            name='format', recommended_value='doc',
            level=OptionRecommendation.LOW,
            short_switch='f', choices=list(ALL_FORMAT_WRITERS),
            help=(_('Format to use inside the pdb container. Choices are:') +
                  ' %s' % sorted(ALL_FORMAT_WRITERS))),
        OptionRecommendation(
            name='pdb_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is cp1252. Note: This option is not honored by all '
                   'formats.')),
        OptionRecommendation(
            name='inline_toc', recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` using the writer for ``opts.format``.

        ``output_path`` may be a filesystem path or an object with a
        ``write`` attribute. A path is opened in binary write mode (creating
        the parent directory if needed) and is always closed again, even if
        writing fails; a caller-supplied stream is left open.

        Raises:
            PDBError: if ``get_writer`` returns ``None`` for ``opts.format``.
        """
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            Writer = get_writer(opts.format)

            if Writer is None:
                # Report the requested format, not the ``format`` builtin.
                raise PDBError('No writer available for format %s.' % opts.format)

            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)

            writer = Writer(opts, log)

            # Start from an empty stream in case the caller passed a reused one.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close streams this method opened itself.
            if close:
                out_stream.close()
class DOCXOutput(OutputFormatPlugin):
    """Output plugin that writes the book as a DOCX file."""

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'

    options = {
        OptionRecommendation(
            name='docx_page_size',
            recommended_value='letter',
            level=OptionRecommendation.LOW,
            choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
                   'are %s') % PAGE_SIZES,
        ),
        OptionRecommendation(
            name='docx_custom_page_size',
            recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'EG. `123x321` to specify the width and height (in pts). '
                   'This overrides any specified page-size.'),
        ),
        OptionRecommendation(
            name='docx_no_cover',
            recommended_value=False,
            help=_('Do not insert the book cover as an image at the start of the document.'
                   ' If you use this option, the book cover will be discarded.'),
        ),
        OptionRecommendation(
            name='docx_no_toc',
            recommended_value=False,
            help=_('Do not insert the table of contents as a page at the start of the document.'),
        ),
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'DOCX',
        ),
    }

    recommendations = {
        ('margin_left', 72.0, OptionRecommendation.MED),
        ('margin_right', 72.0, OptionRecommendation.MED),
        ('margin_top', 72.0, OptionRecommendation.MED),
        ('margin_bottom', 72.0, OptionRecommendation.MED),
    }

    def convert_metadata(self, oeb):
        """Store the book metadata of ``oeb`` on ``self.mi``.

        The OEB metadata is serialised into an OPF 2.0 ``package`` element
        and read back through the OPF reader, whose ``to_book_metadata()``
        result is kept.
        """
        from io import BytesIO

        from lxml import etree
        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
        from calibre.ebooks.oeb.base import OPF, OPF2_NS

        package = etree.Element(
            OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
        oeb.metadata.to_opf2(package)
        raw_opf = etree.tostring(package, encoding='utf-8')
        reader = ReadOPF(BytesIO(raw_opf), populate_spine=False,
                         try_to_guess_cover=False)
        self.mi = reader.to_book_metadata()

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Convert ``oeb`` to DOCX at ``output_path``.

        When ``opts.extract_to`` is truthy the generated file is also dumped
        into that directory via ``do_dump``.
        """
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert

        docx = DOCX(opts, log)
        self.convert_metadata(oeb)
        run_conversion = Convert(oeb, docx, self.mi,
                                 not opts.docx_no_cover, not opts.docx_no_toc)
        run_conversion()
        docx.write(output_path, self.mi)

        if not opts.extract_to:
            return
        from calibre.ebooks.docx.dump import do_dump
        do_dump(output_path, opts.extract_to)
class RBOutput(OutputFormatPlugin):
    """Output plugin that writes the book in RB format."""

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'
    commit_name = 'rb_output'

    options = set([
        OptionRecommendation(
            name='inline_toc', recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` with ``RBWriter``.

        ``output_path`` may be a filesystem path or an object with a
        ``write`` attribute. A path is opened in binary write mode (creating
        the parent directory if needed) and is always closed again, even if
        writing fails; a caller-supplied stream is left open.
        """
        from calibre.ebooks.rb.writer import RBWriter

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            # Start from an empty stream in case the caller passed a reused one.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close streams this method opened itself.
            if close:
                out_stream.close()
class DOCXOutput(OutputFormatPlugin):
    """Output plugin that writes the book as a DOCX file."""

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'

    options = {
        OptionRecommendation(
            name='docx_page_size',
            recommended_value='letter',
            level=OptionRecommendation.LOW,
            choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
                   'are %s') % PAGE_SIZES,
        ),
        OptionRecommendation(
            name='docx_custom_page_size',
            recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'EG. `123x321` to specify the width and height (in pts). '
                   'This overrides any specified page-size.'),
        ),
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'DOCX',
        ),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Convert ``oeb`` to DOCX at ``output_path``.

        When ``opts.extract_to`` is truthy the generated file is also dumped
        into that directory via ``do_dump``.
        """
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert

        docx = DOCX(opts, log)
        run_conversion = Convert(oeb, docx)
        run_conversion()
        docx.write(output_path, oeb)

        if not opts.extract_to:
            return
        from calibre.ebooks.docx.dump import do_dump
        do_dump(output_path, opts.extract_to)
class TCROutput(OutputFormatPlugin):
    """Output plugin that writes the book as compressed TCR text."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(
            name='tcr_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is utf-8.'))
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Extract the book text, compress it and write it to ``output_path``.

        ``output_path`` may be a filesystem path or an object with a
        ``write`` attribute. A path is opened in binary write mode (creating
        the parent directory if needed) and is always closed again, even if
        extraction, compression or writing fails; a caller-supplied stream is
        left open. Characters that cannot be encoded in
        ``opts.tcr_output_encoding`` are replaced.
        """
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.ebooks.compression.tcr import compress

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            # Text-extraction settings forced for this output format.
            setattr(opts, 'flush_paras', False)
            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)
            setattr(opts, 'indent_paras', False)

            writer = TXTMLizer(log)
            txt = writer.extract_content(oeb_book, opts).encode(
                opts.tcr_output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt)

            # Start from an empty stream in case the caller passed a reused one.
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close streams this method opened itself.
            if close:
                out_stream.close()
class KindleComics(InputFormatPlugin):
    """Input plugin that passes CBZ/CBR archives to ``make_book``."""

    name = 'Kindle Comics Input'
    author = 'Pavel Zwerschke'
    supported_platforms = ['windows', 'osx', 'linux']
    file_types = {'cbz', 'cbr'}
    version = (0, 0, 2)
    description = 'Converts cbz and cbr files into a kindle format that is actually readable on Kindle devices.'
    minimum_calibre_version = (5, 0, 0)

    options = {
        OptionRecommendation(
            name='manga',
            recommended_value=False,
            help='Used for right-to-left publications like manga.',
        ),
        OptionRecommendation(
            name='webtoon',
            recommended_value=False,
            help='Used for korean webtoons.',
        ),
        OptionRecommendation(
            name='margins',
            choices=['auto', 'black', 'white'],
            recommended_value='auto',
            help='What color should the margins have.',
        ),
        OptionRecommendation(
            name='no_greyscale',
            recommended_value=False,
            help="Don't convert the image to grayscale (black and white).",
        ),
        OptionRecommendation(
            name='max_width',
            recommended_value='1264',
            help='Maximum width.',
        ),
        OptionRecommendation(
            name='max_height',
            recommended_value='1680',
            help='Maximum height.',
        ),
        OptionRecommendation(
            name='gamma',
            recommended_value='1.0',
            help='Gamma correction. 0 means automatic.',
        ),
    }

    def gui_configuration_widget(self, parent, get_option_by_name, get_option_help, db, book_id=None):
        """Return the plugin's ``PluginWidget`` built from the given arguments."""
        from calibre_plugins.kindle_comics.kindle_comics_input import PluginWidget

        widget = PluginWidget(parent, get_option_by_name, get_option_help, db, book_id)
        return widget

    def convert(self, stream, options, file_ext, log, accelerators):
        """Close ``stream`` and return what ``make_book`` produces for its file.

        The absolute path is taken from ``stream.name`` before the stream is
        closed; ``file_ext`` and ``accelerators`` are not used here.
        """
        from calibre_plugins.kindle_comics.make_book import make_book

        source_path = os.path.abspath(stream.name)
        stream.close()
        settings = _convert_options_to_dict(options)
        return make_book(settings, source_path, log)
class TXTInput(InputFormatPlugin):
    """Input plugin that converts plain text, Markdown, Textile and TXTZ to HTML."""

    name = 'TXT Input'
    author = 'John Schember'
    description = 'Convert TXT files to HTML'
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}

    options = set([
        OptionRecommendation(
            name='paragraph_type', recommended_value='auto',
            choices=['auto', 'block', 'single', 'print', 'unformatted', 'off'],
            help=_('Paragraph structure.\n'
                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\', \'off\']\n'
                   '* auto: Try to auto detect paragraph type.\n'
                   '* block: Treat a blank line as a paragraph break.\n'
                   '* single: Assume every line is a paragraph.\n'
                   '* print: Assume every line starting with 2+ spaces or a tab '
                   'starts a paragraph.\n'
                   '* unformatted: Most lines have hard line breaks, few/no blank lines or indents. '
                   'Tries to determine structure and reformat the differentiate elements.\n'
                   '* off: Don\'t modify the paragraph structure. This is useful when combined with '
                   'Markdown or Textile formatting to ensure no formatting is lost.')),
        OptionRecommendation(
            name='formatting_type', recommended_value='auto',
            choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'],
            help=_('Formatting used within the document.'
                   '* auto: Automatically decide which formatting processor to use.\n'
                   '* plain: Do not process the document formatting. Everything is a '
                   'paragraph and no styling is applied.\n'
                   '* heuristic: Process using heuristics to determine formatting such '
                   'as chapter headings and italic text.\n'
                   '* textile: Processing using textile formatting.\n'
                   '* markdown: Processing using markdown formatting. '
                   'To learn more about markdown see') + ' https://daringfireball.net/projects/markdown/'),
        OptionRecommendation(
            name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                   'With this option all spaces will be displayed.')),
        OptionRecommendation(
            name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        OptionRecommendation(
            name="markdown_extensions", recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
                   'of the standard markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about markdown extensions, see https://pythonhosted.org/Markdown/extensions/index.html\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ) + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    ])

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the text in ``stream`` to HTML and run it through the HTML input plugin.

        The raw bytes are read (for ``txtz``, from every ``.txt``/``.text``
        member of the extracted archive), decoded, normalised into block
        paragraphs and converted with the markdown, textile or basic
        processor. The HTML is written to a temporary ``index*.html`` file,
        converted by the ``html`` input plugin, and the file is removed again.

        Returns:
            Whatever the HTML input plugin's ``convert`` returns, with the
            metadata from the text file applied to its ``metadata``.

        Raises:
            ValueError: if the markdown processor raises ``RuntimeError``.
        """
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        # Raw input is handled as bytes until it is decoded below.
        txt = b''
        log.debug('Reading text from file...')
        length = 0

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        # Files are opened in binary mode, so the separator
                        # must be bytes as well.
                        txt += tf.read() + b'\n\n'
        else:
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s' % options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt)
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)

        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
            except RuntimeError:
                raise ValueError('This txt file has malformed markup, it cannot be'
                                 ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        # os.getcwdu() only exists on Python 2; os.getcwd() returns text on Python 3.
        base = os.getcwd()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            # Keep the candidate inside ``base`` so the existence check and
            # the file that is written refer to the same location.
            fname = os.path.join(base, 'index%d.html' % c)
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile.name, 'rb') as html_stream:
            oeb = html_input.convert(html_stream, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        # Remembered so postprocess_book() can fix placeholder titles.
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        """Replace ``<title>`` text equal to ``_('Unknown')`` with the title saved by ``convert``."""
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == _('Unknown'):
                        title.text = self.html_postprocess_title
class PDFInput(InputFormatPlugin):
    """Input plugin that converts PDF files to HTML through ``pdftohtml``."""

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = _('Convert PDF files to HTML')
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(
            name='no_images',
            recommended_value=False,
            help=_('Do not extract images from the document'),
        ),
        OptionRecommendation(
            name='unwrap_factor',
            recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
                   'be unwrapped. Valid values are a decimal between 0 and 1. The '
                   'default is 0.45, just below the median line length.'),
        ),
        OptionRecommendation(
            name='new_pdf_engine',
            recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'),
        ),
    }

    def convert_new(self, stream, accelerators):
        """Convert via the XML output of ``pdftohtml`` and ``PDFDocument``.

        Reads ``self.opts`` and ``self.log`` and returns the path of
        ``metadata.opf`` in the current working directory.
        """
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as xml_file:
            cleaned = clean_ascii_chars(xml_file.read())
        PDFDocument(cleaned, self.opts, self.log)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log, accelerators):
        """Run ``pdftohtml`` on ``stream.name`` and write an OPF for the result.

        Every file in the current working directory other than ``index.html``
        is added to the manifest after it. If ``toc.ncx`` exists and has a
        manifest id, the ``<spine`` tag of the written OPF is given a ``toc``
        attribute. Returns the path of ``metadata.opf``.
        """
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts, self.log = options, log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(os.getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        mi = get_metadata(stream, 'pdf')
        opf = OPFCreator(os.getcwd(), mi)

        # index.html goes first, followed by everything else pdftohtml wrote.
        others = os.listdir(os.getcwd())
        others.remove('index.html')
        manifest = [('index.html', None)]
        manifest.extend([(entry, None) for entry in others])
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine(['index.html'])
        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opffile:
            opf.render(opffile)

        ncxid = None
        if os.path.exists('toc.ncx'):
            ncxid = opf.manifest.id_for_path('toc.ncx')
        if ncxid:
            with lopen('metadata.opf', 'r+b') as f:
                spine_tag = b'<spine toc="%s"' % as_bytes(ncxid)
                raw = f.read().replace(b'<spine', spine_tag)
                f.seek(0)
                f.write(raw)

        return os.path.join(os.getcwd(), 'metadata.opf')
class EPUBOutput(OutputFormatPlugin):
    """Output plugin that writes the book as an EPUB file."""

    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'

    options = set([
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'EPUB'),
        OptionRecommendation(
            name='dont_split_on_page_breaks',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Turn off splitting at page breaks. Normally, input '
                   'files are automatically split at every page break into '
                   'two files. This gives an output e-book that can be '
                   'parsed faster and with less resources. However, '
                   'splitting is slow and if your source file contains a '
                   'very large number of page breaks, you should turn off '
                   'splitting on page breaks.')),
        OptionRecommendation(
            name='flow_size', recommended_value=260,
            help=_('Split all HTML files larger than this size (in KB). '
                   'This is necessary as most EPUB readers cannot handle large '
                   'file sizes. The default of %defaultKB is the size required '
                   'for Adobe Digital Editions. Set to 0 to disable size based splitting.')),
        OptionRecommendation(
            name='no_default_epub_cover', recommended_value=False,
            help=_('Normally, if the input file has no cover and you don\'t'
                   ' specify one, a default cover is generated with the title, '
                   'authors, etc. This option disables the generation of this cover.')),
        OptionRecommendation(
            name='no_svg_cover', recommended_value=False,
            help=_('Do not use SVG for the book cover. Use this option if '
                   'your EPUB is going to be used on a device that does not '
                   'support SVG, like the iPhone or the JetBook Lite. '
                   'Without this option, such devices will display the cover '
                   'as a blank page.')),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio', recommended_value=False,
            help=_('When using an SVG cover, this option will cause the cover to scale '
                   'to cover the available screen area, but still preserve its aspect ratio '
                   '(ratio of width to height). That means there may be white borders '
                   'at the sides or top and bottom of the image, but the image will '
                   'never be distorted. Without this option the image may be slightly '
                   'distorted, but there will be no borders.')),
        OptionRecommendation(
            name='epub_flatten', recommended_value=False,
            help=_('This option is needed only if you intend to use the EPUB'
                   ' with FBReaderJ. It will flatten the file system inside the'
                   ' EPUB, putting all files into the top level.')),
        OptionRecommendation(
            name='epub_inline_toc', recommended_value=False,
            help=_('Insert an inline Table of Contents that will appear as part of the main book content.')),
        OptionRecommendation(
            name='epub_toc_at_end', recommended_value=False,
            help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')),
        OptionRecommendation(
            name='toc_title', recommended_value=None,
            help=_('Title for any generated in-line table of contents.')),
    ])

    recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)])

    def workaround_webkit_quirks(self):  # {{{
        'Turn <pre> elements that have no text and no children into <div>.'
        from calibre.ebooks.oeb.base import XPath
        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            # Skip spine items without a usable <body> element.
            if not hasattr(body, 'xpath'):
                continue

            for pre in XPath('//h:pre')(body):
                if not pre.text and len(pre) == 0:
                    pre.tag = 'div'
    # }}}

    def upshift_markup(self):  # {{{
        'Upgrade markup to comply with XHTML 1.1 where possible'
        from calibre.ebooks.oeb.base import XPath, XML
        for x in self.oeb.spine:
            root = x.data
            if (not root.get(XML('lang'))) and (root.get('lang')):
                root.set(XML('lang'), root.get('lang'))
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if not hasattr(body, 'xpath'):
                continue
            for u in XPath('//h:u')(root):
                u.tag = 'span'

            # Drop repeated id/name attributes, keeping the first occurrence.
            seen_ids, seen_names = set(), set()
            for elem in XPath('//*[@id or @name]')(root):
                eid, name = elem.get('id', None), elem.get('name', None)
                if eid:
                    if eid in seen_ids:
                        del elem.attrib['id']
                    else:
                        seen_ids.add(eid)
                if name:
                    if name in seen_names:
                        del elem.attrib['name']
                    else:
                        seen_names.add(name)
    # }}}

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` as an EPUB file at ``output_path``.

        Applies the optional inline TOC, filename, markup, image, split and
        cover transforms, makes sure a UUID identifier exists, serialises the
        book with the ``oeb`` output plugin into a temporary directory and
        packs that directory into the EPUB container. If ``opts.extract_to``
        is not ``None`` the finished file is extracted there after the
        previous contents of that location have been deleted.
        """
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from calibre.ebooks.mobi.writer8.toc import TOCAdder
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from calibre.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

        from calibre.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                      max_flow_size=self.opts.flow_size * 1024)
        split(self.oeb, self.opts)

        from calibre.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
            no_default_cover=self.opts.no_default_epub_cover,
            no_svg_cover=self.opts.no_svg_cover,
            preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                          'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add(_('Start'), first.href)

        from calibre.ebooks.oeb.base import OPF
        identifiers = oeb.metadata['identifier']
        uuid = None
        for x in identifiers:
            # An identifier may have no scheme attribute at all; treat that
            # as an empty scheme instead of calling .lower() on None.
            scheme = x.get(OPF('scheme'), None) or ''
            if scheme.lower() == 'uuid' or str(x).startswith('urn:uuid:'):
                uuid = str(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if uuid is None:
            self.log.warn('No UUID identifier found')
            from uuid import uuid4
            uuid = str(uuid4())
            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            for x in identifiers:
                if str(x) == uuid:
                    x.content = 'urn:uuid:' + uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
            if self.is_periodical:
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from calibre.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                               if x.endswith('.ncx')][0])
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

            from calibre.ebooks.epub import initialize_container
            with initialize_container(output_path, os.path.basename(opf),
                                      extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml', encryption)
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                                  metadata_xml.encode('utf-8'))
            if opts.extract_to is not None:
                from calibre.utils.zipfile import ZipFile
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)

    def encrypt_fonts(self, uris, tdir, uuid):  # {{{
        """Obfuscate the fonts named by ``uris`` in place and describe them.

        The first 1024 bytes of every existing font under ``tdir`` are XORed
        with a 16-byte key taken from the hex digits of ``uuid``. Fonts
        shorter than 1024 bytes are left untouched (a warning is logged) but
        are still listed.

        Returns:
            The UTF-8 encoded content for ``META-INF/encryption.xml``, or
            ``None`` when none of the fonts exist.

        Raises:
            ValueError: if ``uuid`` has fewer than 16 hex digits.
        """
        from binascii import unhexlify

        key = re.sub(r'[^a-fA-F0-9]', '', uuid)
        if len(key) < 16:
            raise ValueError('UUID identifier %r is invalid' % uuid)
        # bytearray yields ints on indexing, which is what the XOR below needs.
        key = bytearray(unhexlify((key + key)[:32]))
        paths = []
        with CurrentDir(tdir):
            paths = [os.path.join(*x.split('/')) for x in uris]
            uris = dict(zip(uris, paths))
            fonts = []
            # Iterate over a copy of the keys: missing fonts are popped.
            for uri in list(uris.keys()):
                path = uris[uri]
                if isinstance(path, str):
                    path = path.encode(filesystem_encoding)
                if not os.path.exists(path):
                    uris.pop(uri)
                    continue
                self.log.debug('Encrypting font:', uri)
                with open(path, 'r+b') as f:
                    data = f.read(1024)
                    if len(data) >= 1024:
                        f.seek(0)
                        f.write(bytes(data[i] ^ key[i % 16] for i in range(1024)))
                    else:
                        self.log.warn('Font', path, 'is invalid, ignoring')
                if not isinstance(uri, str):
                    uri = uri.decode('utf-8')
                fonts.append(''' <enc:EncryptedData> <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/> <enc:CipherData> <enc:CipherReference URI="%s"/> </enc:CipherData> </enc:EncryptedData> ''' % (uri.replace('"', '\\"')))
            if fonts:
                ans = '''<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc"> '''
                ans += '\n'.join(fonts)
                ans += '\n</encryption>'
                # Encode once at the end instead of mixing str and bytes.
                return ans.encode('utf-8')
        return None
    # }}}

    def condense_ncx(self, ncx_path):
        """Strip text/tail whitespace in the NCX at ``ncx_path`` and rewrite it.

        Does nothing when ``self.opts.pretty_print`` is truthy.
        """
        from lxml import etree
        if not self.opts.pretty_print:
            tree = etree.parse(ncx_path)
            for tag in tree.getroot().iter(tag=etree.Element):
                if tag.text:
                    tag.text = tag.text.strip()
                if tag.tail:
                    tag.tail = tag.tail.strip()
            compressed = etree.tostring(tree.getroot(), encoding='utf-8')
            with open(ncx_path, 'wb') as f:
                f.write(compressed)

    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                # Do not bind the separator to ``_``: that would make the
                # gettext function a local name throughout this method.
                base, sep, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                        'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it' % frag)
                    node.href = base

        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for img in XPath('//h:img')(body):
                    src = img.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(img)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for a in XPath('//h:a[@name]')(body):
                    if not a.get('id', False):
                        a.set('id', a.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    a.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No preceding sibling (or one without a usable tag):
                        # fall back to the body itself.
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = '\u00a0'
                    style = br.get('style', '').split(';')
                    style = [part for part in (s.strip() for s in style) if part]
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False) and
                        tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            formchildren = XPath('./h:input|./h:button|./h:textarea|'
                                 './h:label|./h:fieldset|./h:legend')
            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle & in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = XPath('ancestor::h:table')
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            special_chars = re.compile('[\u200b\u00ad]')
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.' + lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
    # }}}

    def workaround_sony_quirks(self):  # {{{
        '''
        Perform toc link transforms to alleviate slow loading.
        '''
        from calibre.ebooks.oeb.base import urldefrag, XPath
        from calibre.ebooks.oeb.polish.toc import item_at_top

        def frag_is_at_top(root, frag):
            # True when the first element carrying this id/name is reported
            # by item_at_top(); False when no such element exists.
            elem = XPath('//*[@id="%s" or @name="%s"]' % (frag, frag))(root)
            if elem:
                elem = elem[0]
            else:
                return False
            return item_at_top(elem)

        def simplify_toc_entry(toc):
            # Drop the fragment from this entry when its target is at the
            # top of the spine item, then recurse into the child entries.
            if toc.href:
                href, frag = urldefrag(toc.href)
                if frag:
                    for x in self.oeb.spine:
                        if x.href == href:
                            if frag_is_at_top(x.data, frag):
                                self.log.debug('Removing anchor from TOC href:',
                                               href + '#' + frag)
                                toc.href = href
                            break
            for x in toc:
                simplify_toc_entry(x)

        if self.oeb.toc:
            simplify_toc_entry(self.oeb.toc)
    # }}}
class FB2Input(InputFormatPlugin):
    '''
    Input plugin that turns an FB2 document into ``index.xhtml``,
    ``inline-styles.css``, the embedded binaries and a ``metadata.opf`` in the
    current working directory.
    '''

    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 files to HTML'
    file_types = set(['fb2'])

    recommendations = set([
        ('level1_toc', '//h:h1', OptionRecommendation.MED),
        ('level2_toc', '//h:h2', OptionRecommendation.MED),
        ('level3_toc', '//h:h3', OptionRecommendation.MED),
    ])

    options = set([
        OptionRecommendation(
            name='no_inline_fb2_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Do not insert a Table of Contents at the beginning of the book.')),
    ])

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert the FB2 document readable from ``stream``.

        The output files are written to the current working directory.

        :param stream: seekable file-like object holding the FB2 data
        :param options: conversion options; ``no_inline_fb2_toc`` is read here
        :param log: logger, also stored as ``self.log``
        :return: absolute path of the generated ``metadata.opf``
        :raises ValueError: if the document cannot be parsed as XML
        '''
        from lxml import etree
        from calibre.ebooks.metadata.fb2 import ensure_namespace
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = stream.read()
        # Strip NUL characters using a literal of the same type as the data;
        # mixing str and bytes here raises TypeError.
        if isinstance(raw, bytes):
            raw = raw.replace(b'\0', b'')
        else:
            raw = raw.replace('\0', '')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
                if doc is None:
                    raise Exception('parse failed')
            except Exception:
                # Last resort: escape bare ampersands, a common cause of
                # malformed FB2 files, and retry with the recovering parser.
                doc = etree.fromstring(raw.replace('& ', '&amp; '),
                                       parser=RECOVER_PARSER)
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS

        NAMESPACES = {'f': fb_ns, 'l': XLINK_NS}
        stylesheets = doc.xpath(
            '//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''.join(
            etree.tostring(s, encoding=str, method='text', with_tail=False) + '\n\n'
            for s in stylesheets)
        if css:
            import cssutils
            import logging
            parser = cssutils.CSSParser(fetcher=None,
                                        log=logging.getLogger('calibre.css'))
            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = XHTML_NS
            css = stylesheet.cssText
            # cssutils serializes to bytes; str() on bytes would yield the
            # "b'...'" repr rather than the stylesheet text.
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        # The template is patched as text and handed to lxml as bytes again.
        # NOTE(review): assumes the bundled template is UTF-8 encoded.
        with open(P('templates/fb2.xsl'), 'rb') as f:
            ss = f.read().decode('utf-8')
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                            re.DOTALL).sub('', ss)

        styledoc = etree.fromstring(ss.encode('utf-8'))

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {
            a.get('href')[1:]: a
            for a in result.xpath('//a[@link_note and @href]')
            if a.get('href').startswith('#')
        }
        cites = {
            a.get('link_cite'): a
            for a in result.xpath('//a[@link_cite]')
            if not a.get('href', '')
        }
        all_ids = set(result.xpath('//*/@id'))
        for cite, a in cites.items():
            note = notes.get(cite, None)
            # Compare against None explicitly: the truth value of an lxml
            # element is its number of children, so a child-less <a> is falsy.
            if note is not None:
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        # Point images at the file names chosen by extract_embedded_content.
        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        if not isinstance(index, bytes):
            index = index.encode('utf-8')
        with open('index.xhtml', 'wb') as f:
            f.write(index)
        if not isinstance(css, bytes):
            css = css.encode('utf-8')
        with open('inline-styles.css', 'wb') as f:
            f.write(css)
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
        else:
            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
                href = img.get('{%s}href' % XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    # The binary may have been written under a name with an
                    # added extension; follow that rename.
                    cpath = os.path.abspath(self.binary_map.get(href, href))
                    break

        opf = OPFCreator(os.getcwd(), mi)
        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir('.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def extract_embedded_content(self, doc):
        '''
        Write every base64 encoded ``binary`` child of ``doc`` to a file in
        the current working directory.

        ``self.binary_map`` maps the id of each binary that had an image
        extension appended to the file name actually used.
        '''
        from calibre.ebooks.fb2 import base64_decode
        self.binary_map = {}
        for elem in doc.xpath('./*'):
            if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
                ct = elem.get('content-type', '')
                # NOTE(review): the id comes from the input document and is
                # used as a file name without sanitization.
                fname = elem.attrib['id']
                ext = ct.rpartition('/')[-1].lower()
                if ext in ('png', 'jpeg', 'jpg'):
                    if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg', 'png'}:
                        fname += '.' + ext
                        self.binary_map[elem.get('id')] = fname
                raw = elem.text.strip()
                try:
                    data = base64_decode(raw)
                except (TypeError, ValueError):
                    # binascii.Error, raised by the stdlib decoders for bad
                    # input, is a ValueError subclass.
                    self.log.exception(
                        'Binary data with id=%s is corrupted, ignoring' % (
                            elem.get('id')))
                else:
                    with open(fname, 'wb') as f:
                        f.write(data)
class PDFOutput(OutputFormatPlugin):
    '''
    Output plugin that renders a book, or a collection of images, to PDF
    using either the old or the new PDF writer.
    '''

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'

    options = set([
        OptionRecommendation(
            name='override_profile_size', recommended_value=False,
            help=_('Normally, the PDF page size is set by the output profile'
                   ' chosen under the page setup options. This option will cause the '
                   ' page size settings under PDF Output to override the '
                   ' size specified by the output profile.')),
        OptionRecommendation(
            name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
                   'are %s '
                   'Note: This does not override the unit for margins!') % UNITS),
        OptionRecommendation(
            name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
                   'non default output profile is used. Default is letter. Choices '
                   'are %s') % PAPER_SIZES),
        OptionRecommendation(
            name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'e.g. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio', recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(
            name='pdf_serif_family',
            recommended_value='Liberation Serif' if islinux else 'Times New Roman',
            help=_('The font family used to render serif fonts')),
        OptionRecommendation(
            name='pdf_sans_family',
            recommended_value='Liberation Sans' if islinux else 'Helvetica',
            help=_('The font family used to render sans-serif fonts')),
        OptionRecommendation(
            name='pdf_mono_family',
            recommended_value='Liberation Mono' if islinux else 'Courier New',
            help=_('The font family used to render monospaced fonts')),
        # The help text used to be a copy of the pdf_mono_family one.
        OptionRecommendation(
            name='pdf_standard_font', choices=['serif', 'sans', 'mono'],
            recommended_value='serif',
            help=_('The font family (serif, sans or mono) used as the standard font')),
        OptionRecommendation(
            name='pdf_default_font_size', recommended_value=20,
            help=_('The default font size')),
        OptionRecommendation(
            name='pdf_mono_font_size', recommended_value=16,
            help=_('The default font size for monospaced text')),
        OptionRecommendation(
            name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(
            name='old_pdf_engine', recommended_value=False,
            help=_('Use the old, less capable engine to generate the PDF')),
        OptionRecommendation(
            name='uncompressed_pdf', recommended_value=False,
            help=_('Generate an uncompressed PDF, useful for debugging, '
                   'only works with the new PDF engine.')),
        OptionRecommendation(
            name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(
            name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
                   ) % _('footers')),
        OptionRecommendation(
            name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
                   ) % _('headers')),
        OptionRecommendation(
            name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.'
                   )),
        OptionRecommendation(
            name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` to a PDF at ``output_path``.

        :param output_path: file system path or writable, seekable stream
        :param input_plugin: plugin that produced the book; image
            collections are rendered with ``convert_images``, everything
            else with ``convert_text``
        '''
        from calibre.gui2 import must_use_qt, load_builtin_fonts
        must_use_qt()
        load_builtin_fonts()

        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path
        # oeb.base.OPF builds namespaced tag names while opf2.OPF is the
        # metadata reader; alias the former to keep the two apart.
        from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF
        from lxml import etree
        from io import BytesIO
        package = etree.Element(
            opf_tag('package'),
            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
            nsmap={None: OPF2_NS})
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        '''Write ``images`` to the output with the image PDF writer.'''
        from calibre.ebooks.pdf.writer import ImagePDFWriter
        self.write(ImagePDFWriter, images, None)

    def get_cover_data(self):
        '''Store the data of the manifest item named as cover in ``self.cover_data``.'''
        oeb = self.oeb
        if (oeb.metadata.cover and
                unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = unicode(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            self.cover_data = item.data

    def handle_embedded_fonts(self):
        '''
        On windows, Qt uses GDI which does not support OpenType
        (CFF) fonts, so we need to nuke references to OpenType
        fonts. Qt's directwrite text backend is not mature.
        Also make sure all fonts are embeddable.
        '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction
        from PyQt5.Qt import QByteArray, QRawFont

        font_warnings = set()
        processed = set()
        is_cff = {}
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            remove = set()
            for i, rule in enumerate(item.data.cssRules):
                if rule.type == rule.FONT_FACE_RULE:
                    try:
                        s = rule.style
                        src = s.getProperty('src').propertyValue[0].uri
                    except Exception:
                        # No usable src in this @font-face rule
                        continue
                    path = item.abshref(src)
                    ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                    if ff is None:
                        continue

                    raw = nraw = ff.data
                    if path not in processed:
                        processed.add(path)
                        try:
                            nraw = remove_embed_restriction(raw)
                        except Exception:
                            continue
                        if nraw != raw:
                            ff.data = nraw
                            self.oeb.container.write(path, nraw)

                    if iswindows:
                        if path not in is_cff:
                            f = QRawFont(QByteArray(nraw), 12)
                            is_cff[path] = f.isValid() and len(
                                f.fontTable('head')) == 0
                        if is_cff[path]:
                            if path not in font_warnings:
                                font_warnings.add(path)
                                self.log.warn(
                                    'CFF OpenType fonts are not supported on windows, ignoring: %s' % path)
                            remove.add(i)
            # Pop from the end so the remaining indices stay valid.
            for i in sorted(remove, reverse=True):
                item.data.cssRules.pop(i)

    def convert_text(self, oeb_book):
        '''
        Serialize ``oeb_book`` to a temporary directory and render its spine
        with the writer selected by the ``old_pdf_engine`` option.
        '''
        from calibre.ebooks.metadata.opf2 import OPF
        if self.opts.old_pdf_engine:
            from calibre.ebooks.pdf.writer import PDFWriter
        else:
            from calibre.ebooks.pdf.render.from_html import PDFWriter

        self.log.debug('Serializing oeb input to disk for processing...')
        self.get_cover_data()

        self.handle_embedded_fonts()

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin,
                               self.opts, self.log)

            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            opf = OPF(opfpath, os.path.dirname(opfpath))

            self.write(PDFWriter, [s.path for s in opf.spine],
                       getattr(opf, 'toc', None))

    def write(self, Writer, items, toc):
        '''
        Dump ``items`` through ``Writer`` into ``self.output_path``.

        When ``self.output_path`` is a path the file is created here,
        together with any missing parent directory. It is closed again even
        if rendering fails. A stream passed by the caller is left open.
        '''
        writer = Writer(self.opts, self.log, cover_data=self.cover_data, toc=toc)
        writer.report_progress = self.report_progress

        close = False
        if not hasattr(self.output_path, 'write'):
            close = True
            out_dir = os.path.dirname(self.output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(self.output_path, 'wb')
        else:
            out_stream = self.output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            self.log.debug('Rendering pages to PDF...')
            import time
            st = time.time()
            # Flip to True to profile the rendering step.
            if False:
                import cProfile
                cProfile.runctx(
                    'writer.dump(items, out_stream, PDFMetadata(self.metadata))',
                    globals(), locals(), '/tmp/profile')
            else:
                writer.dump(items, out_stream, PDFMetadata(self.metadata))
            self.log('Rendered PDF in %g seconds:' % (time.time() - st))
        finally:
            if close:
                out_stream.close()
class PDFOutput(OutputFormatPlugin):
    '''
    Output plugin that renders a book, or a collection of images, to PDF.
    It supports PDF specific page margins, including margins taken from the
    input document.
    '''

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'

    options = set([
        OptionRecommendation(name='use_profile_size', recommended_value=False,
            help=_('Instead of using the paper size specified in the PDF Output options,'
                   ' use a paper size corresponding to the current output profile.'
                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
        OptionRecommendation(name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
                   'are %s '
                   'Note: This does not override the unit for margins!') % UNITS),
        OptionRecommendation(name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
                   'non default output profile is used. Default is letter. Choices '
                   'are %s') % PAPER_SIZES),
        OptionRecommendation(name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'e.g. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(name='pdf_serif_family',
            recommended_value='Liberation Serif', help=_(
                'The font family used to render serif fonts')),
        OptionRecommendation(name='pdf_sans_family',
            recommended_value='Liberation Sans', help=_(
                'The font family used to render sans-serif fonts')),
        OptionRecommendation(name='pdf_mono_family',
            recommended_value='Liberation Mono', help=_(
                'The font family used to render monospace fonts')),
        # The help text used to be a copy of the pdf_mono_family one.
        OptionRecommendation(name='pdf_standard_font', choices=['serif',
            'sans', 'mono'],
            recommended_value='serif', help=_(
                'The font family (serif, sans or mono) used as the standard font')),
        OptionRecommendation(name='pdf_default_font_size',
            recommended_value=20, help=_(
                'The default font size')),
        OptionRecommendation(name='pdf_mono_font_size',
            recommended_value=16, help=_(
                'The default font size for monospaced text')),
        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(name='uncompressed_pdf',
            recommended_value=False, help=_(
                'Generate an uncompressed PDF, useful for debugging, '
                'only works with the new PDF engine.')),
        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
        OptionRecommendation(name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')
        ),

        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),
        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
            help=_('Use the page margins specified in the input document via @page CSS rules.'
                   ' This will cause the margins specified in the conversion settings to be ignored.'
                   ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
        ),
    ])

    def specialize_options(self, log, opts, input_fmt):
        '''Set all four common margins to -1 when document margins are requested.'''
        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document margins
            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` to a PDF at ``output_path``.

        ``CALIBRE_WEBKIT_NO_HINTING`` is set in the environment for the
        duration of the call and removed again afterwards, even on failure.

        :param output_path: file system path or writable, seekable stream
        :param input_plugin: plugin that produced the book; image
            collections are rendered with ``convert_images``, everything
            else with ``convert_text``
        '''
        from calibre.gui2 import must_use_qt, load_builtin_fonts
        from calibre.ebooks.oeb.transforms.split import Split
        # Turn off hinting in WebKit (requires a patched build of QtWebKit)
        os.environ['CALIBRE_WEBKIT_NO_HINTING'] = '1'
        self.filtered_font_warnings = set()
        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
        try:
            # split on page breaks, as the JS code to convert page breaks to
            # column breaks will not work because of QWebSettings.LocalContentCanAccessFileUrls
            Split()(oeb_book, opts)
            must_use_qt()
            load_builtin_fonts()

            self.oeb = oeb_book
            self.input_plugin, self.opts, self.log = input_plugin, opts, log
            self.output_path = output_path
            # oeb.base.OPF builds namespaced tag names while opf2.OPF is the
            # metadata reader; alias the former to keep the two apart.
            from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
            from calibre.ebooks.metadata.opf2 import OPF
            from lxml import etree
            from io import BytesIO
            package = etree.Element(
                opf_tag('package'),
                attrib={'version': '2.0', 'unique-identifier': 'dummy'},
                nsmap={None: OPF2_NS})
            self.oeb.metadata.to_opf2(package)
            self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
            self.cover_data = None

            if input_plugin.is_image_collection:
                log.debug('Converting input as an image collection...')
                self.convert_images(input_plugin.get_images())
            else:
                log.debug('Converting input as a text based book...')
                self.convert_text(oeb_book)
        finally:
            os.environ.pop('CALIBRE_WEBKIT_NO_HINTING', None)

    def convert_images(self, images):
        '''Write ``images`` to the output with the image PDF writer.'''
        from calibre.ebooks.pdf.render.from_html import ImagePDFWriter
        self.write(ImagePDFWriter, images, None)

    def get_cover_data(self):
        '''Store the data of the manifest item named as cover in ``self.cover_data``.'''
        oeb = self.oeb
        if (oeb.metadata.cover and
                unicode(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = unicode(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            self.cover_data = item.data

    def process_fonts(self):
        ''' Make sure all fonts are embeddable. Also remove some fonts that cause problems. '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction
        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type == rule.FONT_FACE_RULE:
                    try:
                        s = rule.style
                        src = s.getProperty('src').propertyValue[0].uri
                    except Exception:
                        # No usable src in this @font-face rule
                        continue
                    path = item.abshref(src)
                    ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                    if ff is None:
                        continue

                    raw = nraw = ff.data
                    if path not in processed:
                        processed.add(path)
                        try:
                            nraw = remove_embed_restriction(raw)
                        except Exception:
                            continue
                        if nraw != raw:
                            ff.data = nraw
                            self.oeb.container.write(path, nraw)
                elif iswindows and rule.type == rule.STYLE_RULE:
                    from tinycss.fonts3 import parse_font_family, serialize_font_family
                    s = rule.style
                    f = s.getProperty(u'font-family')
                    if f is not None:
                        font_families = parse_font_family(f.propertyValue.cssText)
                        ff = [x for x in font_families if x.lower() != u'courier']
                        if len(ff) != len(font_families):
                            # Warn only once per conversion
                            if 'courier' not in self.filtered_font_warnings:
                                # See https://bugs.launchpad.net/bugs/1665835
                                self.filtered_font_warnings.add(u'courier')
                                self.log.warn(u'Removing courier font family as it does not render on windows')
                            f.propertyValue.cssText = serialize_font_family(ff or [u'monospace'])

    def convert_text(self, oeb_book):
        '''
        Serialize ``oeb_book`` to a temporary directory and render its spine.

        When document margins are requested, the stored per-file margins are
        first attached to each document root as a JSON attribute.
        '''
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.pdf.render.from_html import PDFWriter

        self.log.debug('Serializing oeb input to disk for processing...')
        self.get_cover_data()

        self.process_fonts()
        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            import json
            for href, margins in self.stored_page_margins.items():
                item = oeb_book.manifest.hrefs.get(href)
                # The margins may refer to a file that is no longer in the
                # manifest; skip it instead of failing on None.
                if item is None:
                    continue
                root = item.data
                if hasattr(root, 'xpath') and margins:
                    root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)

            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            opf = OPF(opfpath, os.path.dirname(opfpath))

            self.write(PDFWriter, [s.path for s in opf.spine], getattr(opf, 'toc', None))

    def write(self, Writer, items, toc):
        '''
        Dump ``items`` through ``Writer`` into ``self.output_path``.

        When ``self.output_path`` is a path the file is created here,
        together with any missing parent directory. It is closed again even
        if rendering fails. A stream passed by the caller is left open.
        '''
        writer = Writer(self.opts, self.log, cover_data=self.cover_data, toc=toc)
        writer.report_progress = self.report_progress

        close = False
        if not hasattr(self.output_path, 'write'):
            close = True
            out_dir = os.path.dirname(self.output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(self.output_path, 'wb')
        else:
            out_stream = self.output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            self.log.debug('Rendering pages to PDF...')
            import time
            st = time.time()
            # Flip to True to profile the rendering step.
            if False:
                import cProfile
                cProfile.runctx('writer.dump(items, out_stream, PDFMetadata(self.metadata))',
                                globals(), locals(), '/tmp/profile')
            else:
                writer.dump(items, out_stream, PDFMetadata(self.metadata))
            self.log('Rendered PDF in %g seconds:'%(time.time()-st))
        finally:
            if close:
                out_stream.close()
class RTFInput(InputFormatPlugin):
    '''
    Input plugin that converts RTF to ``index.xhtml``, ``styles.css``, the
    extracted images and a ``metadata.opf`` in the current working directory.
    '''

    name = 'RTF Input'
    author = 'Kovid Goyal'
    description = _('Convert RTF files to HTML')
    file_types = {'rtf'}
    commit_name = 'rtf_input'

    options = {
        OptionRecommendation(
            name='ignore_wmf', recommended_value=False,
            help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
    }

    def generate_xml(self, stream):
        '''
        Run rtf2xml on ``stream`` and return the generated XML as bytes.

        ``stream`` is handed to ``ParseRtf`` as ``in_file``; ``convert``
        passes the path of the input file here.
        '''
        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = 'dataxml.xml'
        run_lev, debug_dir, indent_out = 1, None, 0
        if getattr(self.opts, 'debug_pipeline', None) is not None:
            try:
                os.mkdir('rtfdebug')
                debug_dir = 'rtfdebug'
                run_lev = 4
                indent_out = 1
                self.log('Running RTFParser in debug mode')
            except Exception:
                self.log.warn('Impossible to run RTFParser in debug mode')
        parser = ParseRtf(
            in_file=stream,
            out_file=ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol=1,

            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf=1,

            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings=1,

            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps=1,

            # Indent resulting XML.
            # Default is 0 (no indent).
            indent=indent_out,

            # Form lists from RTF. Default is 1.
            form_lists=1,

            # Convert headings to sections. Default is 0.
            headings_to_sections=1,

            # Group paragraphs with the same style name. Default is 1.
            group_styles=1,

            # Group borders. Default is 1.
            group_borders=1,

            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs=1,

            # Debug
            deb_dir=debug_dir,

            # Default encoding
            default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',

            # Run level
            run_level=run_lev,
        )
        parser.parse_rtf()
        with open(ofile, 'rb') as f:
            return f.read()

    def extract_images(self, picts):
        '''
        Decode the hex encoded pictures stored in the file ``picts`` and
        write each one to the current directory.

        :return: dict mapping the 1-based picture number to its file name,
            after ``convert_images`` has processed it
        '''
        from calibre.utils.imghdr import what
        from binascii import unhexlify
        self.log('Extracting images...')

        with open(picts, 'rb') as f:
            raw = f.read()
        picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
        hex_pat = re.compile(br'[^a-fA-F0-9]')
        encs = [hex_pat.sub(b'', pict) for pict in picts]

        imap = {}
        for count, enc in enumerate(encs, 1):
            # unhexlify needs an even number of hex digits
            if len(enc) % 2 == 1:
                enc = enc[:-1]
            data = unhexlify(enc)
            # Anything the image sniffer does not recognise is treated as WMF
            fmt = what(None, data)
            if fmt is None:
                fmt = 'wmf'
            name = '%04d.%s' % (count, fmt)
            with open(name, 'wb') as f:
                f.write(data)
            imap[count] = name
            # with open(name+'.hex', 'wb') as f:
            #     f.write(enc)
        return self.convert_images(imap)

    def convert_images(self, imap):
        '''Run ``convert_image`` on every entry of ``imap`` and return ``imap``.'''
        self.default_img = None
        # Iterate over a snapshot as the values are replaced inside the loop.
        for count, val in list(iteritems(imap)):
            try:
                imap[count] = self.convert_image(val)
            except Exception:
                self.log.exception('Failed to convert', val)
        return imap

    def convert_image(self, name):
        '''
        Return the file name to use for image ``name``. WMF images are
        rasterized, or replaced when that fails; other names are returned
        unchanged.
        '''
        if not name.endswith('.wmf'):
            return name
        try:
            return self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r' % name)
        return self.replace_wmf(name)

    def replace_wmf(self, name):
        '''
        Replace the WMF image ``name``.

        With the ``ignore_wmf`` option the file is deleted and the marker
        ``'__REMOVE_ME__'`` is returned. Otherwise a placeholder JPEG is
        written and its name returned.
        '''
        if self.opts.ignore_wmf:
            os.remove(name)
            return '__REMOVE_ME__'
        from calibre.ebooks.covers import message_image
        # The placeholder is generated once and reused for every WMF image.
        if self.default_img is None:
            self.default_img = message_image(
                'Conversion of WMF images is not supported.'
                ' Use Microsoft Word or OpenOffice to save this RTF file'
                ' as HTML and convert that in calibre.')
        name = name.replace('.wmf', '.jpg')
        with lopen(name, 'wb') as f:
            f.write(self.default_img)
        return name

    def rasterize_wmf(self, name):
        '''Write the result of ``wmf_unwrap`` on file ``name`` as a ``.png`` file and return its name.'''
        from calibre.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as f:
            data = f.read()
        data = wmf_unwrap(data)
        name = name.replace('.wmf', '.png')
        with open(name, 'wb') as f:
            f.write(data)
        return name

    def write_inline_css(self, ic, border_styles):
        '''
        Append the inline style classes to ``styles.css``.

        :param ic: object providing the ``font_sizes`` and ``colors`` sequences
        :param border_styles: dict mapping a class name to its CSS declarations
        '''
        font_size_classes = [
            'span.fs%d { font-size: %spt }' % (i, x)
            for i, x in enumerate(ic.font_sizes)
        ]
        color_classes = [
            'span.col%d { color: %s }' % (i, x)
            for i, x in enumerate(ic.colors) if x != 'false'
        ]
        css = textwrap.dedent('''
        span.none {
            text-decoration: none; font-weight: normal;
            font-style: normal; font-variant: normal
        }

        span.italics { font-style: italic }

        span.bold { font-weight: bold }

        span.small-caps { font-variant: small-caps }

        span.underlined { text-decoration: underline }

        span.strike-through { text-decoration: line-through }

        ''')
        css += '\n' + '\n'.join(font_size_classes)
        css += '\n' + '\n'.join(color_classes)

        for cls, val in iteritems(border_styles):
            css += '\n\n.%s {\n%s\n}' % (cls, val)

        with open('styles.css', 'ab') as f:
            f.write(css.encode('utf-8'))

    def convert_borders(self, doc):
        '''
        Give every ``cell`` element of ``doc`` a ``class`` attribute that
        names its border style.

        :return: dict mapping each generated class name to its CSS declarations
        '''
        # Maps a style to its class index: constant-time lookup instead of
        # list.index per cell.
        style_index = {}
        style_map = {}
        for elem in doc.xpath(r'//*[local-name()="cell"]'):
            style = ['border-style: hidden', 'border-width: 1px',
                     'border-color: black']
            for x in ('bottom', 'top', 'left', 'right'):
                bs = elem.get('border-cell-%s-style' % x, None)
                if bs:
                    cbs = border_style_map.get(bs, 'solid')
                    style.append('border-%s-style: %s' % (x, cbs))
                bw = elem.get('border-cell-%s-line-width' % x, None)
                if bw:
                    style.append('border-%s-width: %spt' % (x, bw))
                bc = elem.get('border-cell-%s-color' % x, None)
                if bc:
                    style.append('border-%s-color: %s' % (x, bc))
            style = ';\n'.join(style)
            idx = style_index.setdefault(style, len(style_index))
            cls = 'border_style%d' % idx
            style_map[cls] = style
            elem.set('class', cls)
        return style_map

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert the RTF file behind ``stream``.

        ``stream.name`` is passed to the RTF parser, so ``stream`` must be a
        real file; it is also read again for the metadata.

        :return: absolute path of the generated ``metadata.opf``
        :raises ValueError: if the RTF parser reports an unsupported feature
        '''
        from lxml import etree
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        from calibre.ebooks.rtf.input import InlineClass
        from calibre.utils.xml_parse import safe_xml_fromstring
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError(
                _('This RTF file has a feature calibre does not '
                  'support. Convert it to HTML first and then try it.\n%s') % e)

        # Always bind imap: the XML can contain pict nodes even when no
        # picture directory was produced.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = safe_xml_fromstring(xml)
        border_styles = self.convert_borders(doc)
        # Replace the picture numbers with the extracted file names
        for pict in doc.xpath('//rtf:pict[@num]',
                              namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = 'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
            # clean multiple \n
            res = re.sub(b'\n+', b'\n', res)
            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
            # res = re.sub('\s*<body>', '<body>', res)
            # res = re.sub('(?<=\n)\n{2}',
            # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([('index.xhtml', None)])
        opf.create_spine(['index.xhtml'])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        '''
        Remove every image whose ``src`` is the ``'__REMOVE_ME__'`` marker
        from the spine documents, keeping the text that followed it.
        '''
        for item in oeb.spine:
            for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'):
                p = img.getparent()
                idx = p.index(img)
                p.remove(img)
                # Re-attach the tail text to the previous sibling, or to the
                # parent when the image was the first child.
                if img.tail:
                    if idx == 0:
                        p.text = (p.text or '') + img.tail
                    else:
                        p[idx - 1].tail = (p[idx - 1].tail or '') + img.tail
class ComicInput(InputFormatPlugin):
    '''
    Input plugin for comic archives (.cbz, .cbr, .cb7) and comic
    collections (.cbc).

    Every page image is extracted (and, unless ``no_process`` is set,
    run through ``process_pages``), wrapped in XHTML and described by a
    ``metadata.opf``/``toc.ncx`` pair written to the current directory.
    '''

    name = 'Comic Input'
    author = 'Kovid Goyal'
    description = _(
        'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
    )
    file_types = {'cbz', 'cbr', 'cb7', 'cbc'}
    is_image_collection = True
    commit_name = 'comic_input'
    core_usage = -1

    options = {
        OptionRecommendation(
            name='colors',
            recommended_value=0,
            help=_('Reduce the number of colors used in the image. This works only'
                   ' if you choose the PNG output format. It is useful to reduce file sizes.'
                   ' Set to zero to turn off. Maximum value is 256. It is off by default.'
                   )),
        OptionRecommendation(
            name='dont_normalize',
            recommended_value=False,
            help=_('Disable normalize (improve contrast) color range '
                   'for pictures. Default: False')),
        OptionRecommendation(
            name='keep_aspect_ratio',
            recommended_value=False,
            help=_(
                'Maintain picture aspect ratio. Default is to fill the screen.'
            )),
        OptionRecommendation(
            name='dont_sharpen',
            recommended_value=False,
            help=_('Disable sharpening.')),
        OptionRecommendation(
            name='disable_trim',
            recommended_value=False,
            help=_('Disable trimming of comic pages. For some comics, '
                   'trimming might remove content as well as borders.')),
        OptionRecommendation(
            name='landscape',
            recommended_value=False,
            help=_("Don't split landscape images into two portrait images")),
        OptionRecommendation(
            name='wide',
            recommended_value=False,
            help=_("Keep aspect ratio and scale image using screen height as "
                   "image width for viewing in landscape mode.")),
        OptionRecommendation(
            name='right2left',
            recommended_value=False,
            help=_('Used for right-to-left publications like manga. '
                   'Causes landscape pages to be split into portrait pages '
                   'from right to left.')),
        OptionRecommendation(
            name='despeckle',
            recommended_value=False,
            help=_('Enable Despeckle. Reduces speckle noise. '
                   'May greatly increase processing time.')),
        OptionRecommendation(
            name='no_sort',
            recommended_value=False,
            help=_("Don't sort the files found in the comic "
                   "alphabetically by name. Instead use the order they were "
                   "added to the comic.")),
        OptionRecommendation(
            name='output_format',
            choices=['png', 'jpg'],
            recommended_value='png',
            help=_(
                'The format that images in the created e-book '
                'are converted to. You can experiment to see which format gives '
                'you optimal size and look on your device.')),
        OptionRecommendation(
            name='no_process',
            recommended_value=False,
            help=_("Apply no processing to the image")),
        OptionRecommendation(
            name='dont_grayscale',
            recommended_value=False,
            help=_('Do not convert the image to grayscale (black and white)')),
        OptionRecommendation(
            name='comic_image_size',
            recommended_value=None,
            help=_(
                'Specify the image size as widthxheight pixels. Normally,'
                ' an image size is automatically calculated from the output '
                'profile, this option overrides it.')),
        OptionRecommendation(
            name='dont_add_comic_pages_to_toc',
            recommended_value=False,
            help=_(
                'When converting a CBC do not add links to each page to'
                ' the TOC. Note this only applies if the TOC has more than one'
                ' section')),
    }

    recommendations = {
        ('margin_left', 0, OptionRecommendation.HIGH),
        ('margin_top', 0, OptionRecommendation.HIGH),
        ('margin_right', 0, OptionRecommendation.HIGH),
        ('margin_bottom', 0, OptionRecommendation.HIGH),
        ('insert_blank_line', False, OptionRecommendation.HIGH),
        ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
        ('change_justification', 'left', OptionRecommendation.HIGH),
        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
        ('chapter', None, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
        ('linearize_tables', False, OptionRecommendation.HIGH),
    }

    def get_comics_from_collection(self, stream):
        '''
        Unpack a .cbc collection and return its comics.

        The archive is extracted into a persistent temporary directory and
        its ``comics.txt`` index is read: one ``filename:title`` entry per
        line. Entries whose file is not readable are silently dropped.

        :param stream: Open stream on the .cbc (zip) file.
        :return: List of ``[title, absolute_path]`` pairs.
        :raises ValueError: If ``comics.txt`` is missing or no listed comic
            is readable.
        '''
        from calibre.libunzip import extract as zipextract
        tdir = PersistentTemporaryDirectory('_comic_collection')
        zipextract(stream, tdir)
        comics = []
        with CurrentDir(tdir):
            if not os.path.exists('comics.txt'):
                raise ValueError(
                    ('%s is not a valid comic collection'
                     ' no comics.txt was found in the file') % stream.name)
            with open('comics.txt', 'rb') as f:
                raw = f.read()
            # The explicit-endianness codecs keep the BOM as the first
            # decoded character, hence the [1:] after each decode.
            if raw.startswith(codecs.BOM_UTF16_BE):
                raw = raw.decode('utf-16-be')[1:]
            elif raw.startswith(codecs.BOM_UTF16_LE):
                raw = raw.decode('utf-16-le')[1:]
            elif raw.startswith(codecs.BOM_UTF8):
                raw = raw.decode('utf-8')[1:]
            else:
                raw = raw.decode('utf-8')
            for line in raw.splitlines():
                line = line.strip()
                if not line:
                    continue
                fname, sep, title = line.partition(':')
                fname = fname.replace('#', '_')
                fname = os.path.join(tdir, *fname.split('/'))
                if not title:
                    # No explicit title: fall back to the file name stem.
                    title = os.path.basename(fname).rpartition('.')[0]
                if os.access(fname, os.R_OK):
                    comics.append([title, fname])
        if not comics:
            raise ValueError('%s has no comics' % stream.name)
        return comics

    def get_pages(self, comic, tdir2):
        '''
        Extract one comic and return the paths of its page images in
        ``tdir2``.

        With the ``no_process`` option the pages are copied verbatim
        (prefixed with their index); otherwise they go through
        ``process_pages`` and pages that fail are logged as warnings.

        :param comic: Path to the comic archive.
        :param tdir2: Directory that receives the resulting page images.
        :raises ValueError: If the comic contains no (valid) pages.
        '''
        from calibre.ebooks.comic.input import (extract_comic, process_pages,
                                                find_pages)
        tdir = extract_comic(comic)
        new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
                               verbose=self.opts.verbose)
        if not new_pages:
            raise ValueError('Could not find any pages in the comic: %s' %
                             comic)
        if self.opts.no_process:
            copied = []
            for i, page in enumerate(new_pages):
                dest = os.path.join(
                    tdir2, '{} - {}'.format(i, os.path.basename(page)))
                shutil.copyfile(page, dest)
                copied.append(dest)
            new_pages = copied
        else:
            new_pages, failures = process_pages(new_pages, self.opts,
                                                self.report_progress, tdir2)
            if failures:
                self.log.warning('Could not process the following pages '
                                 '(run with --verbose to see why):')
                for f in failures:
                    self.log.warning('\t', f)
            if not new_pages:
                raise ValueError(
                    'Could not find any valid pages in comic: %s' % comic)
        return new_pages

    def get_images(self):
        '''Return the page image paths collected by the last convert().'''
        return self._images

    def convert(self, stream, opts, file_ext, log, accelerators):
        '''
        Convert a comic (or .cbc collection) and return the absolute path
        of the generated ``metadata.opf``.

        A collection with more than one comic gets one ``comic_N``
        sub-directory per comic; a single comic is unpacked into the
        current directory.

        :raises ValueError: If no comic yields any page.
        '''
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log = opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        num_pages_per_comic = []
        for i, (title, fname) in enumerate(comics_):
            cdir = 'comic_%d' % (i + 1) if len(comics_) > 1 else '.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            num_pages_per_comic.append(len(pages))
            if self.for_viewer:
                comics.append(
                    (title, pages, [self.create_viewer_wrapper(pages, cdir)]))
            else:
                wrappers = self.create_wrappers(pages)
                comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s' % stream.name)

        mi = MetaInformation(
            os.path.basename(stream.name).rpartition('.')[0], [_('Unknown')])
        opf = OPFCreator(os.getcwd(), mi)
        entries = []

        def href(x):
            # Single comic: files live next to the OPF. Several comics: keep
            # the "comic_N/<file>" tail of the path.
            if len(comics) == 1:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        cover_href = None
        for comic in comics:
            pages, wrappers = comic[1:]
            page_entries = [(x, None) for x in map(href, pages)]
            entries += [(w, None) for w in map(href, wrappers)] + page_entries
            if cover_href is None and page_entries:
                cover_href = page_entries[0][0]
        opf.create_manifest(entries)

        spine = []
        for comic in comics:
            spine.extend(map(href, comic[2]))
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)

        if self.for_viewer and cover_href:
            if os.path.isabs(cover_href):
                cover_href = os.path.relpath(cover_href).replace(os.sep, '/')
            opf.guide.set_cover(cover_href)

        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            if self.for_viewer:
                # One wrapper file holds every page; link to its anchors.
                wrapper_page_href = href(wrappers[0])
                for i in range(num_pages_per_comic[0]):
                    toc.add_item('{}#page_{}'.format(wrapper_page_href, i + 1),
                                 None,
                                 _('Page') + ' %d' % (i + 1),
                                 play_order=i)
            else:
                for i, x in enumerate(wrappers):
                    toc.add_item(href(x),
                                 None,
                                 _('Page') + ' %d' % (i + 1),
                                 play_order=i)
        else:
            po = 0
            for num_pages, comic in zip(num_pages_per_comic, comics):
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]),
                                    None,
                                    comic[0],
                                    play_order=po)
                if not opts.dont_add_comic_pages_to_toc:
                    if self.for_viewer:
                        wrapper_page_href = href(wrappers[0])
                        for i in range(num_pages):
                            stoc.add_item('{}#page_{}'.format(
                                wrapper_page_href, i + 1),
                                None,
                                _('Page') + ' %d' % (i + 1),
                                play_order=po)
                            po += 1
                    else:
                        for i, x in enumerate(wrappers):
                            stoc.add_item(href(x),
                                          None,
                                          _('Page') + ' %d' % (i + 1),
                                          play_order=po)
                            po += 1
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
            opf.render(m, n, 'toc.ncx')
        return os.path.abspath('metadata.opf')

    def create_wrappers(self, pages):
        '''
        Write one ``page_N.xhtml`` file per image, next to the first image,
        and return the list of wrapper paths in page order.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS
        wrappers = []
        WRAPPER = textwrap.dedent('''\
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <title>Page #%d</title>
                <style type="text/css">
                    @page { margin:0pt; padding: 0pt}
                    body { margin: 0pt; padding: 0pt}
                    div { text-align: center }
                </style>
            </head>
            <body>
                <div>
                    <img src="%s" alt="comic page #%d" />
                </div>
            </body>
        </html>
        ''')
        dir = os.path.dirname(pages[0])
        for i, page in enumerate(pages):
            wrapper = WRAPPER % (XHTML_NS, i + 1, os.path.basename(page), i + 1)
            page = os.path.join(dir, 'page_%d.xhtml' % (i + 1))
            with open(page, 'wb') as f:
                f.write(wrapper.encode('utf-8'))
            wrappers.append(page)
        return wrappers

    def create_viewer_wrapper(self, pages, cdir):
        '''
        Write a single ``wrapper.xhtml`` into ``cdir`` containing one
        ``<img id="page_N">`` per page and return its path.

        :param pages: Paths of the page images; only their base names are
            referenced from the wrapper.
        :param cdir: Directory the wrapper is written to.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS

        def page(pnum, src):
            return '<img id="page_{}" src="{}"></img>'.format(
                pnum + 1, os.path.basename(src))

        # Keep the markup in its own name so the ``pages`` list is not
        # shadowed by a string.
        imgs = '\n'.join(page(i, src) for i, src in enumerate(pages))
        wrapper = '''
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <style type="text/css">
                    html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
                    img {
                        width: 100%%; height: 100%%;
                        object-fit: contain;
                        margin-left: auto; margin-right: auto;
                        max-width: 100vw; max-height: 100vh;
                        top: 50vh; transform: translateY(-50%%);
                        position: relative;
                        page-break-after: always;
                    }
                </style>
            </head>
            <body>
            %s
            </body>
        </html>
        ''' % (XHTML_NS, imgs)
        path = os.path.join(cdir, 'wrapper.xhtml')
        with open(path, 'wb') as f:
            f.write(wrapper.encode('utf-8'))
        return path
class SNBOutput(OutputFormatPlugin):
    '''
    Output plugin that writes a book as an SNB file.

    The book is laid out in a temporary directory (``snbf`` for the book
    info and TOC, ``snbc`` for chapters, ``snbc/images`` for images) which
    is then packaged with ``SNBFile``.
    '''

    name = 'SNB Output'
    author = 'Li Fanxi'
    file_type = 'snb'
    commit_name = 'snb_output'

    options = {
        OptionRecommendation(
            name='snb_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is utf-8.')),
        OptionRecommendation(
            name='snb_max_line_length',
            recommended_value=0,
            level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
                   'the first space before the specified value. If no space is found '
                   'the line will be broken at the space after and will exceed the '
                   'specified value. Also, there is a minimum of 25 characters. '
                   'Use 0 to disable line splitting.')),
        OptionRecommendation(
            name='snb_insert_empty_line',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert an empty line between '
                   'two paragraphs.')),
        OptionRecommendation(
            name='snb_dont_indent_first_line',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert two space characters '
                   'to indent the first line of each paragraph.')),
        OptionRecommendation(
            name='snb_hide_chapter_name',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Specify whether or not to hide the chapter title for each '
                   'chapter. Useful for image-only output (eg. comics).')),
        OptionRecommendation(
            name='snb_full_screen',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Resize all the images for full screen mode. ')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` to ``output_path`` as an SNB file.

        Steps: rasterize SVG (when available), write ``book.snbf`` and
        ``toc.snbf``, convert every spine document to one or more ``.snbc``
        chapters (documents that no TOC entry points at are merged into the
        preceding chapter), convert images, then package the directory.
        '''
        from lxml import etree
        from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
        from calibre.ebooks.snb.snbfile import SNBFile
        from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName

        self.opts = opts
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        def write_tree(path, tree):
            # Serialize an lxml tree to ``path`` as pretty-printed UTF-8.
            with open(path, 'wb') as f:
                f.write(
                    etree.tostring(tree,
                                   pretty_print=True,
                                   encoding='utf-8'))

        # Create temp dir
        with TemporaryDirectory('_snb_output') as tdir:
            # Create stub directories
            snbfDir = os.path.join(tdir, 'snbf')
            snbcDir = os.path.join(tdir, 'snbc')
            snbiDir = os.path.join(tdir, 'snbc/images')
            os.mkdir(snbfDir)
            os.mkdir(snbcDir)
            os.mkdir(snbiDir)

            # Process Meta data
            meta = oeb_book.metadata
            title = str(meta.title[0]) if meta.title else ''
            authors = [str(x) for x in meta.creator if x.role == 'aut']
            publishers = str(meta.publisher[0]) if meta.publisher else ''
            lang = str(meta.language[0]).upper() if meta.language else ''
            abstract = str(meta.description[0]) if meta.description else ''

            # Process Cover: the guide's cover is only used when the guide
            # has no 'titlepage' entry.
            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
            href = None
            if 'titlepage' not in g:
                if 'cover' in g:
                    href = g['cover'].href

            # Output book info file
            bookInfoTree = etree.Element("book-snbf", version="1.0")
            headTree = etree.SubElement(bookInfoTree, "head")
            etree.SubElement(headTree, "name").text = title
            etree.SubElement(headTree, "author").text = ' '.join(authors)
            etree.SubElement(headTree, "language").text = lang
            etree.SubElement(headTree, "rights")
            etree.SubElement(headTree, "publisher").text = publishers
            etree.SubElement(
                headTree, "generator").text = __appname__ + ' ' + __version__
            etree.SubElement(headTree, "created")
            etree.SubElement(headTree, "abstract").text = abstract
            if href is not None:
                etree.SubElement(headTree,
                                 "cover").text = ProcessFileName(href)
            else:
                etree.SubElement(headTree, "cover")
            write_tree(os.path.join(snbfDir, 'book.snbf'), bookInfoTree)

            # Output TOC
            tocInfoTree = etree.Element("toc-snbf")
            tocHead = etree.SubElement(tocInfoTree, "head")
            tocBody = etree.SubElement(tocInfoTree, "body")
            # Maps a document href to its list of (anchor, title) pairs;
            # the anchor "" stands for the start of the document.
            outputFiles = {}
            if oeb_book.toc.count() == 0:
                log.warn('This SNB file has no Table of Contents. '
                         'Creating a default TOC')
                first = next(iter(oeb_book.spine))
                oeb_book.toc.add(_('Start page'), first.href)
            else:
                first = next(iter(oeb_book.spine))
                if oeb_book.toc[0].href != first.href:
                    # The pages before the first item in toc will be stored
                    # as "Cover Pages".
                    # oeb_book.toc does not support "insert", so we generate
                    # the tocInfoTree directly instead of modifying the toc
                    ch = etree.SubElement(tocBody, "chapter")
                    ch.set("src", ProcessFileName(first.href) + ".snbc")
                    ch.text = _('Cover pages')
                    outputFiles[first.href] = []
                    outputFiles[first.href].append(("", _("Cover pages")))

            for tocitem in oeb_book.toc:
                if tocitem.href.find('#') != -1:
                    item = tocitem.href.split('#')
                    if len(item) != 2:
                        log.error('Error in TOC item: %s' % tocitem)
                    else:
                        if item[0] in outputFiles:
                            outputFiles[item[0]].append(
                                (item[1], tocitem.title))
                        else:
                            # First entry for this file points at an anchor:
                            # everything before the anchor becomes a
                            # "(Preface)" chapter.
                            outputFiles[item[0]] = []
                            outputFiles[item[0]].append(
                                ("", tocitem.title + _(" (Preface)")))
                            ch = etree.SubElement(tocBody, "chapter")
                            ch.set("src",
                                   ProcessFileName(item[0]) + ".snbc")
                            ch.text = tocitem.title + _(" (Preface)")
                            outputFiles[item[0]].append(
                                (item[1], tocitem.title))
                else:
                    if tocitem.href in outputFiles:
                        outputFiles[tocitem.href].append(("", tocitem.title))
                    else:
                        outputFiles[tocitem.href] = []
                        outputFiles[tocitem.href].append(("", tocitem.title))
                        ch = etree.SubElement(tocBody, "chapter")
                        ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
                        ch.text = tocitem.title

            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)
            write_tree(os.path.join(snbfDir, 'toc.snbf'), tocInfoTree)

            # Output Files
            oldTree = None
            mergeLast = False
            lastName = None
            for item in s:
                if m.hrefs[item.href].media_type in OEB_DOCS:
                    if item.href not in outputFiles:
                        log.debug(
                            'File %s is unused in TOC. Continue in last chapter'
                            % item.href)
                        mergeLast = True
                    else:
                        if oldTree is not None and mergeLast:
                            # The previous chapter had documents merged into
                            # it; write it out again before moving on.
                            log.debug('Output the modified chapter again: %s' %
                                      lastName)
                            write_tree(os.path.join(snbcDir, lastName),
                                       oldTree)
                            mergeLast = False

                    log.debug('Converting %s to snbc...' % item.href)
                    snbwriter = SNBMLizer(log)
                    snbcTrees = None
                    if not mergeLast:
                        snbcTrees = snbwriter.extract_content(
                            oeb_book, item, outputFiles[item.href], opts)
                        for subName in snbcTrees:
                            postfix = ''
                            if subName != '':
                                postfix = '_' + subName
                            lastName = ProcessFileName(item.href + postfix +
                                                       ".snbc")
                            oldTree = snbcTrees[subName]
                            write_tree(os.path.join(snbcDir, lastName),
                                       oldTree)
                    else:
                        log.debug('Merge %s with last TOC item...' % item.href)
                        snbwriter.merge_content(oldTree, oeb_book, item,
                                                [('', _("Start"))], opts)

            # Output the last one if needed
            if oldTree is not None and mergeLast:
                log.debug('Output the last modified chapter again: %s' %
                          lastName)
                write_tree(os.path.join(snbcDir, lastName), oldTree)
                mergeLast = False

            for item in m:
                if m.hrefs[item.href].media_type in OEB_IMAGES:
                    log.debug('Converting image: %s ...' % item.href)
                    content = m.hrefs[item.href].data
                    # Convert & Resize image
                    self.HandleImage(
                        content,
                        os.path.join(snbiDir, ProcessFileName(item.href)))

            # Package as SNB File
            snbFile = SNBFile()
            snbFile.FromDir(tdir)
            snbFile.Output(output_path)

    def HandleImage(self, imageData, imagePath):
        '''
        Write ``imageData`` to ``imagePath``, shrinking it first when it is
        larger than the target screen.

        The target size comes from the output profile (``screen_size`` with
        ``snb_full_screen``, ``comic_screen_size`` otherwise) or defaults to
        540x700 when no options are set. The aspect ratio is preserved and
        the output format is taken from the extension of ``imagePath``.
        '''
        from calibre.utils.img import image_from_data, resize_image, image_to_data
        img = image_from_data(imageData)
        x, y = img.width(), img.height()
        # ``opts`` is only assigned by convert(); tolerate direct calls.
        opts = getattr(self, 'opts', None)
        if opts:
            if opts.snb_full_screen:
                SCREEN_X, SCREEN_Y = opts.output_profile.screen_size
            else:
                SCREEN_X, SCREEN_Y = opts.output_profile.comic_screen_size
        else:
            SCREEN_X = 540
            SCREEN_Y = 700
        # Handle big image only
        if x > SCREEN_X or y > SCREEN_Y:
            xScale = float(x) / SCREEN_X
            yScale = float(y) / SCREEN_Y
            scale = max(xScale, yScale)
            # TODO : intelligent image rotation
            # img = img.rotate(90)
            # x,y = y,x
            # Clamp to 1px so an extreme aspect ratio cannot produce a
            # zero-sized image.
            img = resize_image(img, max(1, int(x // scale)),
                               max(1, int(y // scale)))
        with lopen(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
class TXTInput(InputFormatPlugin):
    '''
    Input plugin for plain text, TXTZ, Markdown and Textile files.

    The text is decoded, normalised, converted to HTML with the selected
    formatting processor and then handed to the HTML input plugin.
    '''

    name = 'TXT Input'
    author = 'John Schember'
    description = _('Convert TXT files to HTML')
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': _('Try to auto detect paragraph type'),
            'block': _('Treat a blank line as a paragraph break'),
            'single': _('Assume every line is a paragraph'),
            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
            'off': _('Don\'t modify the paragraph structure'),
        },
        'formatting_types': {
            'auto': _('Automatically decide which formatting processor to use'),
            'plain': _('No formatting'),
            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
            'textile': _('Use the TexTile markup language'),
            'markdown': _('Use the Markdown markup language')
        },
    }

    options = {
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                   '* auto: {auto}\n'
                   '* plain: {plain}\n'
                   '* heuristic: {heuristic}\n'
                   '* textile: {textile}\n'
                   '* markdown: {markdown}\n'
                   'To learn more about markdown see {url}').format(
                       url='https://daringfireball.net/projects/markdown/',
                       **ui_data['formatting_types'])
        ),
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
                   'Choices are:\n'
                   '* auto: {auto}\n'
                   '* block: {block}\n'
                   '* single: {single}\n'
                   '* print: {print}\n'
                   '* unformatted: {unformatted}\n'
                   '* off: {off}').format(**ui_data['paragraph_types'])
        ),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                   'With this option all spaces will be displayed.')),
        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
                   'of the standard markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about markdown extensions, see {}\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, base_dir, fname, data):
        '''
        Write ``data`` to a new, not yet existing file in ``base_dir``.

        The file is named ``<stem>-<n><ext>`` with the smallest ``n >= 1``
        that is free. The path is recorded in ``self.shifted_files`` so
        that convert() can delete it once the conversion is done.

        :return: Path of the file that was written.
        '''
        name, ext = os.path.splitext(fname)
        c = 1
        while os.path.exists(os.path.join(base_dir, '{}-{}{}'.format(name, c, ext))):
            c += 1
        ans = os.path.join(base_dir, '{}-{}{}'.format(name, c, ext))
        # Track the file for cleanup; create the list on demand so the
        # method also works when called outside of convert().
        tracked = getattr(self, 'shifted_files', None)
        if tracked is None:
            tracked = self.shifted_files = []
        tracked.append(ans)
        with open(ans, 'wb') as f:
            f.write(data)
        return ans

    def fix_resources(self, html, base_dir):
        '''
        Copy locally referenced images next to the document and rewrite
        their ``src`` attributes to the copies.

        Only ``src`` values that are neither absolute paths nor start with
        a ``file``/``http``/``https``/``ftp`` prefix are considered, and
        only when the resolved file is readable.

        :return: The (possibly re-serialized) HTML.
        '''
        from html5_parser import parse
        root = parse(html)
        changed = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            prefix = src.split(':', 1)[0].lower()
            if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
                src = os.path.join(base_dir, src)
                if os.access(src, os.R_OK):
                    with open(src, 'rb') as f:
                        data = f.read()
                    shifted = self.shift_file(base_dir, os.path.basename(src), data)
                    changed = True
                    img.set('src', os.path.basename(shifted))
        if changed:
            from lxml import etree
            html = etree.tostring(root, encoding='unicode')
        return html

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert a text document to an OEB book via the HTML input plugin.

        ``options`` is modified in place: detected paragraph/formatting
        types are stored on it, and the HTML input plugin's recommended
        option values are applied to it before it is invoked.

        :raises ValueError: If markdown conversion fails with a
            ``RuntimeError``.
        '''
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        # The UTF-32-LE BOM begins with the bytes of the UTF-16-LE BOM, so
        # the 4-byte BOMs have to be tested before the 2-byte ones.
        for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF16_LE,
                    codecs.BOM_UTF16_BE, codecs.BOM_UTF8):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
                except RuntimeError:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file(base_dir, 'index.html', html.encode('utf-8'))
            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                # Generate oeb from html conversion. The stream is closed
                # before the shifted files are removed below.
                with open(htmlfile, 'rb') as html_stream:
                    oeb = html_input.convert(html_stream, options, 'html', log, {})
            finally:
                options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        '''
        Replace ``<title>`` elements whose text is the translated
        "Unknown" with the title recorded by convert().
        '''
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == _('Unknown'):
                        title.text = self.html_postprocess_title
class FB2Output(OutputFormatPlugin):
    '''
    Output plugin that serializes a book to FictionBook 2 (.fb2).
    '''

    name = 'FB2 Output'
    author = 'John Schember'
    file_type = 'fb2'
    commit_name = 'fb2_output'

    FB2_GENRES = [
        # Science Fiction & Fantasy
        'sf_history',  # Alternative history
        'sf_action',  # Action
        'sf_epic',  # Epic
        'sf_heroic',  # Heroic
        'sf_detective',  # Detective
        'sf_cyberpunk',  # Cyberpunk
        'sf_space',  # Space
        'sf_social',  # Social-philosophical
        'sf_horror',  # Horror & mystic
        'sf_humor',  # Humor
        'sf_fantasy',  # Fantasy
        'sf',  # Science Fiction
        # Detectives & Thrillers
        'det_classic',  # Classical detectives
        'det_police',  # Police Stories
        'det_action',  # Action
        'det_irony',  # Ironical detectives
        'det_history',  # Historical detectives
        'det_espionage',  # Espionage detectives
        'det_crime',  # Crime detectives
        'det_political',  # Political detectives
        'det_maniac',  # Maniacs
        'det_hard',  # Hard-boiled
        'thriller',  # Thrillers
        'detective',  # Detectives
        # Prose
        'prose_classic',  # Classics prose
        'prose_history',  # Historical prose
        'prose_contemporary',  # Contemporary prose
        'prose_counter',  # Counterculture
        'prose_rus_classic',  # Russian classics prose
        'prose_su_classics',  # Soviet classics prose
        # Romance
        'love_contemporary',  # Contemporary Romance
        'love_history',  # Historical Romance
        'love_detective',  # Detective Romance
        'love_short',  # Short Romance
        'love_erotica',  # Erotica
        # Adventure
        'adv_western',  # Western
        'adv_history',  # History
        'adv_indian',  # Indians
        'adv_maritime',  # Maritime Fiction
        'adv_geo',  # Travel & geography
        'adv_animal',  # Nature & animals
        'adventure',  # Other
        # Children's
        'child_tale',  # Fairy Tales
        'child_verse',  # Verses
        'child_prose',  # Prose
        'child_sf',  # Science Fiction
        'child_det',  # Detectives & Thrillers
        'child_adv',  # Adventures
        'child_education',  # Educational
        'children',  # Other
        # Poetry & Dramaturgy
        'poetry',  # Poetry
        'dramaturgy',  # Dramaturgy
        # Antique literature
        'antique_ant',  # Antique
        'antique_european',  # European
        'antique_russian',  # Old russian
        'antique_east',  # Old east
        'antique_myths',  # Myths. Legends. Epos
        'antique',  # Other
        # Scientific-educational
        'sci_history',  # History
        'sci_psychology',  # Psychology
        'sci_culture',  # Cultural science
        'sci_religion',  # Religious studies
        'sci_philosophy',  # Philosophy
        'sci_politics',  # Politics
        'sci_business',  # Business literature
        'sci_juris',  # Jurisprudence
        'sci_linguistic',  # Linguistics
        'sci_medicine',  # Medicine
        'sci_phys',  # Physics
        'sci_math',  # Mathematics
        'sci_chem',  # Chemistry
        'sci_biology',  # Biology
        'sci_tech',  # Technical
        'science',  # Other
        # Computers & Internet
        'comp_www',  # Internet
        'comp_programming',  # Programming
        'comp_hard',  # Hardware
        'comp_soft',  # Software
        'comp_db',  # Databases
        'comp_osnet',  # OS & Networking
        'computers',  # Other
        # Reference
        'ref_encyc',  # Encyclopedias
        'ref_dict',  # Dictionaries
        'ref_ref',  # Reference
        'ref_guide',  # Guidebooks
        'reference',  # Other
        # Nonfiction
        'nonf_biography',  # Biography & Memoirs
        'nonf_publicism',  # Publicism
        'nonf_criticism',  # Criticism
        'design',  # Art & design
        'nonfiction',  # Other
        # Religion & Inspiration
        'religion_rel',  # Religion
        'religion_esoterics',  # Esoterics
        'religion_self',  # Self-improvement
        'religion',  # Other
        # Humor
        'humor_anecdote',  # Anecdote (funny stories)
        'humor_prose',  # Prose
        'humor_verse',  # Verses
        'humor',  # Other
        # Home & Family
        'home_cooking',  # Cooking
        'home_pets',  # Pets
        'home_crafts',  # Hobbies & Crafts
        'home_entertain',  # Entertaining
        'home_health',  # Health
        'home_garden',  # Garden
        'home_diy',  # Do it yourself
        'home_sport',  # Sports
        'home_sex',  # Erotica & sex
        'home',  # Other
    ]
    ui_data = {
        'sectionize': {
            'toc': _('Section per entry in the ToC'),
            'files': _('Section per file'),
            'nothing': _('A single section')
        },
        'genres': FB2_GENRES,
    }

    options = {
        OptionRecommendation(
            name='sectionize',
            recommended_value='files',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sectionize']),
            help=_(
                'Specify how sections are created:\n'
                ' * nothing: {nothing}\n'
                ' * files: {files}\n'
                ' * toc: {toc}\n'
                'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
                '(turn on "Force use of auto-generated Table of Contents").').
            format(**ui_data['sectionize'])),
        OptionRecommendation(
            name='fb2_genre',
            recommended_value='antique',
            level=OptionRecommendation.LOW,
            choices=FB2_GENRES,
            help=(_('Genre for the book. Choices: %s\n\n See: ') %
                  ', '.join(FB2_GENRES)) +
            'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' +
            _('for a complete list with descriptions.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` as FB2 to ``output_path``.

        :param output_path: Either a filesystem path (missing parent
            directories are created and the file is closed afterwards) or
            an object with a ``write`` method, which is rewound, truncated
            and left open.
        '''
        from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.fb2.fb2ml import FB2MLizer

        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        linearize_jacket(oeb_book)

        fb2mlizer = FB2MLizer(log)
        fb2_content = fb2mlizer.extract_content(oeb_book, opts)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(fb2_content.encode('utf-8', 'replace'))
        finally:
            # Only close streams opened here; a caller-supplied stream
            # stays open.
            if close:
                out_stream.close()
class MOBIOutput(OutputFormatPlugin):
    """Output plugin that writes MOBI files.

    Depending on the ``mobi_file_type`` option the result contains the old
    MOBI 6 format (``'old'``), only KF8 (``'new'``) or both (``'both'``).
    Periodicals are always written as ``'old'``.
    """

    name = 'MOBI Output'
    author = 'Kovid Goyal'
    file_type = 'mobi'
    commit_name = 'mobi_output'
    ui_data = {'file_types': ['old', 'both', 'new']}

    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.')),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                   'the book has its own table of contents.')),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.')),
        OptionRecommendation(
            name='dont_compress',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.')),
        OptionRecommendation(
            name='personal_doc',
            recommended_value='[PDOC]',
            help=_('Tag for MOBI files to be marked as personal documents.'
                   ' This option has no effect on the conversion. It is used'
                   ' only when sending MOBI files to a device. If the file'
                   ' being sent has the specified tag, it will be marked as'
                   ' a personal document when sent to the Kindle.')),
        OptionRecommendation(
            name='mobi_ignore_margins',
            recommended_value=False,
            help=_(
                'Ignore margins in the input document. If False, then '
                'the MOBI output plugin will try to convert margins specified'
                ' in the input document, otherwise it will ignore them.')),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=_(
                'When adding the Table of Contents to the book, add it at the start of the '
                'book instead of the end. Not recommended.')),
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified folder. The contents of the folder are first '
                   'deleted, so be careful.') % 'MOBI'),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                   ' on the Kindle. WARNING: Using this feature means that '
                   ' the book will not auto sync its last read position '
                   ' on multiple devices. Complain to Amazon.')),
        OptionRecommendation(
            name='mobi_keep_original_images',
            recommended_value=False,
            help=_(
                'By default calibre converts all images to JPEG format '
                'in the output MOBI file. This is for maximum compatibility '
                'as some older MOBI viewers have problems with other image '
                'formats. This option tells calibre not to do this. '
                'Useful if your document contains lots of GIF/PNG images that '
                'become very large when converted to JPEG.')),
        OptionRecommendation(
            name='mobi_file_type',
            choices=ui_data['file_types'],
            recommended_value='old',
            help=_(
                'By default calibre generates MOBI files that contain the '
                'old MOBI 6 format. This format is compatible with all '
                'devices. However, by changing this setting, you can tell '
                'calibre to generate MOBI files that contain both MOBI 6 and '
                'the new KF8 format, or only the new KF8 format. KF8 has '
                'more features than MOBI 6, but only works with newer Kindles. '
                'Allowed values: {}').format('old, both, new')),
    }

    def check_for_periodical(self):
        """Set ``opts.mobi_periodical`` and prepare periodical books.

        For a periodical the ToC is restructured and a masthead image is
        ensured before the flag is set to True; otherwise the flag is False.
        """
        if self.is_periodical:
            self.periodicalize_toc()
            self.check_for_masthead()
            self.opts.mobi_periodical = True
        else:
            self.opts.mobi_periodical = False

    def check_for_masthead(self):
        """Make sure the guide has a ``masthead`` entry, generating one if needed."""
        if 'masthead' in self.oeb.guide:
            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
            return

        from calibre.ebooks import generate_masthead
        self.oeb.log.debug(
            'No masthead found in manifest, generating default mastheadImage...'
        )
        raw = generate_masthead(unicode_type(
            self.oeb.metadata['title'][0]))
        # Named item_id (not id) so the builtin id() is not shadowed.
        item_id, href = self.oeb.manifest.generate('masthead', 'masthead')
        self.oeb.manifest.add(item_id, href, 'image/gif', data=raw)
        self.oeb.guide.add('masthead', 'Masthead Image', href)

    def periodicalize_toc(self):
        """Rewrite the ToC as periodical -> section -> article nodes.

        Does nothing when the ToC is empty, the spine has fewer than three
        items, or the first ToC entry already has klass ``'periodical'``.
        """
        from calibre.ebooks.oeb.base import TOC
        toc = self.oeb.toc
        if not toc or len(self.oeb.spine) < 3:
            return
        if toc[0].klass == 'periodical':
            return

        one, two = self.oeb.spine[0], self.oeb.spine[1]
        self.log('Converting TOC for MOBI periodical indexing...')

        if toc.depth() < 3:
            # single section periodical
            self.oeb.manifest.remove(one)
            self.oeb.manifest.remove(two)
            sections = [
                TOC(klass='section',
                    title=_('All articles'),
                    href=self.oeb.spine[0].href)
            ]
            for x in toc:
                sections[0].nodes.append(x)
        else:
            # multi-section periodical
            self.oeb.manifest.remove(one)
            sections = list(toc)
            for x in sections:
                x.klass = 'section'
                articles_ = list(x)
                if articles_:
                    self.oeb.manifest.remove(
                        self.oeb.manifest.hrefs[x.href])
                    x.href = articles_[0].href

        # Detach every article from its section, remembering them per
        # section (keyed by the identity of the section object).
        articles = {}
        for sec in sections:
            articles[id(sec)] = []
            for a in list(sec):
                a.klass = 'article'
                articles[id(sec)].append(a)
                sec.nodes.remove(a)

        root = TOC(klass='periodical',
                   href=self.oeb.spine[0].href,
                   title=unicode_type(self.oeb.metadata.title[0]))

        # Only sections that actually have articles are attached to the root.
        for s in sections:
            if articles[id(s)]:
                for a in articles[id(s)]:
                    s.nodes.append(a)
                root.nodes.append(s)

        for x in list(toc.nodes):
            toc.nodes.remove(x)

        toc.nodes.append(root)

        # Fix up the periodical href to point to first section href
        toc.nodes[0].href = toc.nodes[0].nodes[0].href

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` to ``output_path`` in the requested MOBI flavour.

        :param oeb: the book being converted
        :param output_path: destination passed on to the writers
        :param input_plugin: the plugin that produced ``oeb``
        :param opts: conversion options (``mobi_file_type`` is read here)
        :param log: logger stored on ``self.log``
        """
        from calibre.ebooks.mobi.writer2.resources import Resources
        self.log, self.opts, self.oeb = log, opts, oeb

        mobi_type = opts.mobi_file_type
        if self.is_periodical:
            mobi_type = 'old'  # Amazon does not support KF8 periodicals
        create_kf8 = mobi_type in ('new', 'both')

        remove_html_cover(self.oeb, self.log)
        resources = Resources(oeb, opts, self.is_periodical,
                              add_fonts=create_kf8)
        self.check_for_periodical()

        if create_kf8:
            from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
            remove_duplicate_anchors(self.oeb)
            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            Split()(self.oeb, self.opts)

        kf8 = self.create_kf8(
            resources, for_joint=mobi_type == 'both') if create_kf8 else None
        if mobi_type == 'new':
            # KF8-only output: the MOBI 6 pipeline is skipped entirely.
            kf8.write(output_path)
            extract_mobi(output_path, opts)
            return

        self.log('Creating MOBI 6 output')
        self.write_mobi(input_plugin, output_path, kf8, resources)

    def create_kf8(self, resources, for_joint=False):
        """Return the KF8 book built by ``create_kf8_book`` for ``self.oeb``."""
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        return create_kf8_book(self.oeb, self.opts, resources,
                               for_joint=for_joint)

    def write_mobi(self, input_plugin, output_path, kf8, resources):
        """Run the MOBI 6 transforms on ``self.oeb`` and write the file.

        :param input_plugin: compared against the ``cbz`` input plugin to
            decide the ``write_page_breaks_after_item`` writer flag
        :param output_path: destination handed to ``MobiWriter``
        :param kf8: KF8 book to pass to the writer, or None
        :param resources: the ``Resources`` object created in ``convert``
        """
        from calibre.ebooks.mobi.mobiml import MobiMLizer
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.customize.ui import plugin_for_input_format

        opts, oeb = self.opts, self.oeb
        if not opts.no_inline_toc:
            tocadder = HTMLTOCAdder(
                title=opts.toc_title,
                position='start' if opts.mobi_toc_at_start else 'end')
            tocadder(oeb, opts)
        mangler = CaseMangler()
        mangler(oeb, opts)
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb, opts)
        except Unavailable:
            self.log.warn(
                'SVG rasterizer unavailable, SVG will not be converted')
        else:
            # Add rasterized SVG images
            resources.add_extra_images()
        if hasattr(self.oeb, 'inserted_metadata_jacket'):
            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
        mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
        mobimlizer(oeb, opts)
        write_page_breaks_after_item = (
            input_plugin is not plugin_for_input_format('cbz'))
        from calibre.ebooks.mobi.writer2.main import MobiWriter
        writer = MobiWriter(
            opts, resources, kf8,
            write_page_breaks_after_item=write_page_breaks_after_item)
        writer(oeb, output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Apply ``CSSCleanup`` to ``item`` using ``stylizer``."""
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        CSSCleanup(log, opts)(item, stylizer)

    def workaround_fire_bugs(self, jacket):
        """Replace the jacket's layout table with div/span elements.

        Every ``table`` and ``tr`` becomes a ``div`` and every ``td`` a
        ``span``; the elements are retagged in place.
        """
        # The idiotic Fire crashes when trying to render the table used to
        # layout the jacket
        from calibre.ebooks.oeb.base import XHTML
        for table in jacket.data.xpath('//*[local-name()="table"]'):
            table.tag = XHTML('div')
            for tr in table.xpath('descendant::*[local-name()="tr"]'):
                cols = tr.xpath('descendant::*[local-name()="td"]')
                tr.tag = XHTML('div')
                for td in cols:
                    # Inside this loop ``cols`` is never empty, so the
                    # cell tag is always 'span'.
                    td.tag = XHTML('span')
class AZW3Output(OutputFormatPlugin):
    """Output plugin that writes a KF8-only (AZW3) file."""

    name = 'AZW3 Output'
    author = 'Kovid Goyal'
    file_type = 'azw3'
    commit_name = 'azw3_output'

    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.'),
        ),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                   'the book has its own table of contents.'),
        ),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.'),
        ),
        OptionRecommendation(
            name='dont_compress',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.'),
        ),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=_(
                'When adding the Table of Contents to the book, add it at the start of the '
                'book instead of the end. Not recommended.'),
        ),
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified folder. The contents of the folder are first '
                   'deleted, so be careful.') % 'AZW3',
        ),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                   ' on the Kindle. WARNING: Using this feature means that '
                   ' the book will not auto sync its last read position '
                   ' on multiple devices. Complain to Amazon.'),
        ),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Build a KF8 book from ``oeb`` and write it to ``output_path``.

        When ``opts.mobi_passthrough`` is set, the HTML cover removal and
        the page-break split are skipped.
        """
        from calibre.ebooks.mobi.writer2.resources import Resources
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors

        self.oeb = oeb
        self.opts = opts
        self.log = log
        opts.mobi_periodical = self.is_periodical
        skip_transforms = getattr(opts, 'mobi_passthrough', False)

        remove_duplicate_anchors(oeb)
        book_resources = Resources(
            self.oeb, self.opts, self.is_periodical,
            add_fonts=True, process_images=False)

        if not skip_transforms:
            remove_html_cover(self.oeb, self.log)
            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            splitter = Split()
            splitter(self.oeb, self.opts)

        book = create_kf8_book(
            self.oeb, self.opts, book_resources, for_joint=False)
        book.write(output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Run ``CSSCleanup`` over ``item`` with the given ``stylizer``."""
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)
class KEPUBInput(EPUBInput):
    """Extension of calibre's EPUBInput to understand KePub format books."""

    name = "KePub Input"
    description = "Convert KEPUB files (.kepub) to HTML"
    author = "David Forrester"
    file_types = {"kepub"}
    version = plugin_version
    minimum_calibre_version = (0, 1, 0)

    options = {
        OptionRecommendation(
            name="strip_kobo_spans",
            recommended_value=True,
            help=_(  # noqa: F821
                "Kepubs have spans wrapping each sentence. These are used by "
                "the ereader for the reading location and bookmark location. "
                "They are not used by an ePub reader but are valid code and "
                "can be safely be left in the ePub. If you plan to edit the "
                "ePub, it is recommended that you remove the spans."),
        )
    }

    recommendations = set()

    def gui_configuration_widget(self, parent, get_option_by_name,
                                 get_option_help, db, book_id=None):
        """Set up the input processor's configuration widget."""
        from calibre_plugins.kepubin.conversion.input_config import PluginWidget

        return PluginWidget(parent, get_option_by_name, get_option_help, db,
                            book_id)

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert a KePub file into a structure calibre can process.

        The archive is extracted into the current working directory, the
        OPF is located and cleaned up, and the absolute path of the
        rewritten ``content.opf`` is returned.

        Raises ``ValueError`` when no OPF is found, when a manifest item is
        DTBook markup or when the spine ends up empty, and ``DRMError`` when
        a ``rights.xml`` file is present.
        """
        log("KEPUBInput::convert - start")
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF

        def current_dir():
            # Python 2 needs getcwdu() to get the directory as unicode.
            if sys.version_info.major == 2:
                return os.getcwdu()
            return os.getcwd()

        try:
            archive = ZipFile(stream)
            archive.extractall(current_dir())
        except Exception:
            log.exception("KEPUB appears to be invalid ZIP file, trying a "
                          "more forgiving ZIP parser")
            from calibre.utils.localunzip import extractall

            stream.seek(0)
            extractall(stream)

        opf = self.find_opf()
        if opf is None:
            # Fall back to the first .opf found on disk, ignoring macOS
            # resource folders and hidden files.
            candidates = (
                os.path.abspath(name) for name in walk(".")
                if name.lower().endswith(".opf")
                and "__MACOSX" not in name
                and not os.path.basename(name).startswith(".")
            )
            opf = next(candidates, None)

        path = getattr(stream, "name", "stream")
        if opf is None:
            raise ValueError(
                _(  # noqa: F821
                    "{0} is not a valid KEPUB file (could not find opf)"
                ).format(path))

        if os.path.exists(os.path.abspath("rights.xml")):
            raise DRMError(os.path.basename(path))

        opf = os.path.relpath(opf, current_dir())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self.encrypted_fonts = []

        if len(parts) > 1 and parts[0]:
            # The OPF is not at the top level: prefix manifest and guide
            # hrefs with its directory.
            delta = "/".join(parts[:-1]) + "/"
            for entry in opf.itermanifest():
                entry.set("href", delta + entry.get("href"))
            for entry in opf.iterguide():
                entry.set("href", delta + entry.get("href"))

        if opf.package_version >= 3.0:
            rationalize = self.rationalize_cover3
        else:
            rationalize = self.rationalize_cover2
        self.removed_cover = rationalize(opf, log)

        self.optimize_opf_parsing = opf
        if any(entry.get("media-type", "") == "application/x-dtbook+xml"
               for entry in opf.itermanifest()):
            raise ValueError(
                _("EPUB files with DTBook markup are not supported"
                  )  # noqa: F821
            )

        # Manifest ids with these media types must not appear in the spine.
        template_types = {
            "application/vnd.adobe-page-template+xml",
            "application/vnd.adobe.page-template+xml",
            "application/adobe-page-template+xml",
            "application/adobe.page-template+xml",
            "application/text",
        }
        not_for_spine = set()
        for entry in opf.itermanifest():
            entry_id = entry.get("id", None)
            if entry_id and entry.get("media-type", None) in template_types:
                not_for_spine.add(entry_id)

        # Drop spine entries that have no idref, point at an excluded item
        # or repeat an idref already seen.
        seen = set()
        for itemref in list(opf.iterspine()):
            ref = itemref.get("idref", None)
            if ref and ref not in not_for_spine and ref not in seen:
                seen.add(ref)
            else:
                itemref.getparent().remove(itemref)

        if not list(opf.iterspine()):
            raise ValueError(
                _("No valid entries in the spine of this EPUB")  # noqa: F821
            )

        with open("content.opf", "wb") as nopf:
            nopf.write(opf.render())

        return os.path.abspath("content.opf")

    def postprocess_book(self, oeb, opts, log):
        """Perform any needed post-input processing on the book."""
        log("KEPUBInput::postprocess_book - start")
        from calibre.ebooks.oeb.base import XHTML_NS

        # The Kobo spans wrap each sentence. Remove them and add their text to
        # the parent tag.
        def refactor_span(span):
            # NOTE(review): only the span's text and tail are re-attached;
            # child elements of the span appear to be discarded - confirm
            # Kobo spans never contain markup.
            parent = span.getparent()
            prev_idx = parent.index(span) - 1
            parent.remove(span)
            moved = (span.text if span.text else "") + \
                (span.tail if span.tail else "")
            if prev_idx < 0:
                # The span was the first child: its text joins the parent's.
                base = "" if parent.text is None else parent.text
                parent.text = base + moved
            else:
                # Otherwise it joins the tail of the preceding sibling.
                sibling = parent[prev_idx]
                base = "" if sibling.tail is None else sibling.tail
                sibling.tail = base + moved

        super(KEPUBInput, self).postprocess_book(oeb, opts, log)

        if not opts.strip_kobo_spans:
            log("KEPUBInput::postprocess_book - not stripping kobo spans")
            return

        for item in oeb.spine:
            log("item.__class__.__name__", item.__class__.__name__)
            if hasattr(item.data, "xpath"):
                kobo_spans = item.data.xpath('//h:span[@class="koboSpan"]',
                                             namespaces={"h": XHTML_NS})
                for span in kobo_spans:
                    refactor_span(span)

        log("KEPUBInput::postprocess_book - end")

    # Shouldn't get called, but overriding just in case.
    def process_encryption(self, encfile, opf, log):
        """Determine if encryption needs to be processed."""
        # The encfile argument is ignored; only rights.xml in the current
        # directory is checked.
        rights_file = os.path.abspath("rights.xml")
        return not os.path.exists(rights_file)
class PDFOutput(OutputFormatPlugin):
    """Output plugin that renders a book, or an image collection, to PDF."""

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'

    options = {
        OptionRecommendation(
            name='override_profile_size',
            recommended_value=False,
            help=_('Normally, the PDF page size is set by the output profile'
                   ' chosen under page options. This option will cause the '
                   ' page size settings under PDF Output to override the '
                   ' size specified by the output profile.')),
        # The value is wrapped in a one-tuple so that the whole sequence is
        # formatted as a single %s value even if it is a tuple.
        OptionRecommendation(
            name='unit',
            recommended_value='inch',
            level=OptionRecommendation.LOW,
            short_switch='u',
            choices=UNITS,
            help=_(
                'The unit of measure for page sizes. Default is inch. Choices '
                'are %s '
                'Note: This does not override the unit for margins!') % (UNITS,)),
        OptionRecommendation(
            name='paper_size',
            recommended_value='letter',
            level=OptionRecommendation.LOW,
            choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
                   'non default output profile is used. Default is letter. Choices '
                   'are %s') % (PAPER_SIZES,)),
        OptionRecommendation(
            name='custom_size',
            recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'EG. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(
            name='pdf_serif_family',
            recommended_value='Liberation Serif'
            if islinux else 'Times New Roman',
            help=_('The font family used to render serif fonts')),
        OptionRecommendation(
            name='pdf_sans_family',
            recommended_value='Liberation Sans' if islinux else 'Helvetica',
            help=_('The font family used to render sans-serif fonts')),
        OptionRecommendation(
            name='pdf_mono_family',
            recommended_value='Liberation Mono' if islinux else 'Courier New',
            help=_('The font family used to render monospaced fonts')),
        # NOTE(review): this help text is identical to pdf_mono_family's and
        # looks like a copy/paste slip; left unchanged because it is a
        # translation msgid - confirm before rewording.
        OptionRecommendation(
            name='pdf_standard_font',
            choices=['serif', 'sans', 'mono'],
            recommended_value='serif',
            help=_('The font family used to render monospaced fonts')),
        OptionRecommendation(
            name='pdf_default_font_size',
            recommended_value=20,
            help=_('The default font size')),
        OptionRecommendation(
            name='pdf_mono_font_size',
            recommended_value=16,
            help=_('The default font size for monospaced text')),
        OptionRecommendation(
            name='pdf_mark_links',
            recommended_value=False,
            help=_(
                'Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(
            name='old_pdf_engine',
            recommended_value=False,
            help=_('Use the old, less capable engine to generate the PDF')),
        OptionRecommendation(
            name='uncompressed_pdf',
            recommended_value=False,
            help=_('Generate an uncompressed PDF, useful for debugging, '
                   'only works with the new PDF engine.')),
        OptionRecommendation(
            name='pdf_page_numbers',
            recommended_value=False,
            help=_(
                'Add page numbers to the bottom of every page in the generated PDF file. If you '
                'specify a footer template, it will take precedence '
                'over this option.')),
        OptionRecommendation(
            name='pdf_footer_template',
            recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
                   ) % _('footers')),
        OptionRecommendation(
            name='pdf_header_template',
            recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
                   ) % _('headers')),
        OptionRecommendation(
            name='pdf_add_toc',
            recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.'
                   )),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for generated table of contents.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render ``oeb_book`` to ``output_path`` as a PDF.

        Image collections (``input_plugin.is_image_collection``) are written
        from the plugin's images; everything else goes through
        ``convert_text``.
        """
        from calibre.gui2 import must_use_qt, load_builtin_fonts
        must_use_qt()
        load_builtin_fonts()

        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path

        # Two different objects are both called OPF upstream (a tag-name
        # helper and the metadata class); import them under distinct names
        # so the result does not depend on import order.
        from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF as OPFMetadata
        from lxml import etree
        from io import BytesIO
        package = etree.Element(opf_tag('package'),
                                attrib={
                                    'version': '2.0',
                                    'unique-identifier': 'dummy'
                                },
                                nsmap={None: OPF2_NS})
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPFMetadata(BytesIO(
            etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        """Write ``images`` to the output using ``ImagePDFWriter``."""
        from calibre.ebooks.pdf.writer import ImagePDFWriter
        self.write(ImagePDFWriter, images, None)

    def get_cover_data(self):
        """Store the cover item's data in ``self.cover_data`` if the book has one."""
        oeb = self.oeb
        if not oeb.metadata.cover:
            return
        cover_id = unicode(oeb.metadata.cover[0])
        if cover_id in oeb.manifest.ids:
            self.cover_data = oeb.manifest.ids[cover_id].data

    def handle_embedded_fonts(self):
        '''
        Because of QtWebKit's inability to handle embedded fonts correctly, we
        remove the embedded fonts and make them available system wide instead.
        If you ever move to Qt WebKit 2.3+ then this will be unnecessary.
        '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction
        from PyQt4.Qt import QFontDatabase, QByteArray, QRawFont, QFont

        # First find all @font-face rules and remove them, adding the embedded
        # fonts to Qt
        family_map = {}
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            remove = set()
            for rule_index, rule in enumerate(item.data.cssRules):
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                # The rule is dropped even if the font cannot be registered.
                remove.add(rule_index)
                try:
                    s = rule.style
                    src = s.getProperty('src').propertyValue[0].uri
                    font_family = s.getProperty(
                        'font-family').propertyValue[0].value
                except Exception:
                    # Best effort: skip malformed @font-face rules.
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None:
                    continue

                raw = ff.data
                self.oeb.manifest.remove(ff)
                try:
                    raw = remove_embed_restriction(raw)
                except Exception:
                    # Best effort: skip fonts that cannot be processed.
                    continue
                fid = QFontDatabase.addApplicationFontFromData(
                    QByteArray(raw))
                family_name = None
                if fid > -1:
                    try:
                        family_name = unicode(
                            QFontDatabase.applicationFontFamilies(fid)[0])
                    except (IndexError, KeyError):
                        pass
                if family_name:
                    family_map[icu_lower(font_family)] = family_name

            # Pop from the end so earlier indices stay valid.
            for rule_index in sorted(remove, reverse=True):
                item.data.cssRules.pop(rule_index)

        # Now map the font family name specified in the css to the actual
        # family name of the embedded font (they may be different in general).
        font_warnings = set()
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.STYLE_RULE:
                    continue
                ff = rule.style.getProperty('font-family')
                if ff is None:
                    continue
                val = ff.propertyValue
                for pos in xrange(val.length):
                    try:
                        k = icu_lower(val[pos].value)
                    except (AttributeError, TypeError):
                        val[pos].value = k = 'times'
                    if k in family_map:
                        val[pos].value = family_map[k]
                if iswindows:
                    # On windows, Qt uses GDI which does not support OpenType
                    # (CFF) fonts, so we need to nuke references to OpenType
                    # fonts. Note that you could compile QT with configure
                    # -directwrite, but that requires atleast Vista SP2
                    for pos in xrange(val.length):
                        family = val[pos].value
                        if family:
                            f = QRawFont.fromFont(QFont(family))
                            if len(f.fontTable('head')) == 0:
                                if family not in font_warnings:
                                    self.log.warn(
                                        'Ignoring unsupported font: %s' %
                                        family)
                                    font_warnings.add(family)
                                # Either a bitmap or (more likely) a CFF font
                                val[pos].value = 'times'

    def convert_text(self, oeb_book):
        """Serialize ``oeb_book`` to a temporary OEB folder and render it to PDF.

        The writer class is chosen by the ``old_pdf_engine`` option.
        """
        from calibre.ebooks.metadata.opf2 import OPF
        if self.opts.old_pdf_engine:
            from calibre.ebooks.pdf.writer import PDFWriter
        else:
            from calibre.ebooks.pdf.render.from_html import PDFWriter

        self.log.debug('Serializing oeb input to disk for processing...')
        self.get_cover_data()

        self.handle_embedded_fonts()

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin,
                               self.opts, self.log)

            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            opf = OPF(opfpath, os.path.dirname(opfpath))

            self.write(PDFWriter, [s.path for s in opf.spine],
                       getattr(opf, 'toc', None))

    def write(self, Writer, items, toc):
        """Instantiate ``Writer`` and dump ``items`` to ``self.output_path``.

        ``self.output_path`` may be a path or an object with a ``write``
        method. A file opened here is always closed, even if rendering
        fails; a caller-supplied stream is left open.
        """
        writer = Writer(self.opts, self.log, cover_data=self.cover_data,
                        toc=toc)
        writer.report_progress = self.report_progress

        close = not hasattr(self.output_path, 'write')
        if close:
            out_dir = os.path.dirname(self.output_path)
            if not os.path.exists(out_dir) and out_dir != '':
                os.makedirs(out_dir)
            out_stream = open(self.output_path, 'wb')
        else:
            out_stream = self.output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            self.log.debug('Rendering pages to PDF...')
            import time
            st = time.time()
            writer.dump(items, out_stream, PDFMetadata(self.metadata))
            self.log('Rendered PDF in %g seconds:' % (time.time() - st))
        finally:
            if close:
                out_stream.close()

    def specialize_css_for_output(self, log, opts, item, stylizer):
        '''
        Qt WebKit (4.8.x) cannot handle font-variant: small-caps. It tries to
        fake the small caps, which is ok, but the faking continues on to
        subsequent text that should not be in small-caps. So we workaround the
        problem by faking small caps ourselves. A minimal example that Qt
        chokes on:
        <html><body>
        <p style="font-variant:small-caps">Some Small-caps Text</p>
        <p style="text-align:justify">Some non small-caps text with enough text for at least one
        full line and justification enabled. Both of these are needed for the example to work.</p>
        </body></html>
        '''
        from calibre.ebooks.oeb.base import XHTML
        import itertools, string
        if not hasattr(item.data, 'xpath'):
            return
        ws = unicode(string.whitespace)

        def fake_small_caps(elem):
            # Split elem.text into runs of lowercase / other characters and
            # wrap each run in a span; lowercase runs are upper-cased and
            # shrunk.
            spans = []
            for lowercase, textiter in itertools.groupby(
                    elem.text, lambda x: x not in ws and icu_lower(x) == x):
                text = ''.join(textiter)
                if lowercase:
                    text = icu_upper(text)
                span = elem.makeelement(XHTML('span'))
                span.text = text
                style = stylizer.style(span)
                if lowercase:
                    style.set('font-size', '0.65em')
                spans.append(span)
            elem.text = None
            # Insert the spans before the existing children. Assigning to
            # elem[0:] would replace (and so drop) every existing child.
            for pos, span in enumerate(spans):
                elem.insert(pos, span)

        def process_elem(elem, parent_fv=None):
            # Snapshot the children first: fake_small_caps adds new spans
            # that must not be processed again.
            children = tuple(elem)
            style = stylizer.style(elem)
            fv = style.drop('font-variant')
            if not fv or fv.lower() == 'inherit':
                fv = parent_fv
            if fv and fv.lower() in {'smallcaps', 'small-caps'}:
                if elem.text:
                    fake_small_caps(elem)
            for child in children:
                # Only recurse into nodes whose tag is a string.
                if hasattr(getattr(child, 'tag', None), 'lower'):
                    process_elem(child, parent_fv=fv)

        for body in item.data.xpath('//*[local-name()="body"]'):
            process_elem(body)
class PDFOutput(OutputFormatPlugin):
    '''
    Output plugin that writes a PDF file.

    Image collections are handed to the PDF image writer; everything else is
    first serialized as an OEB directory and then rendered by the HTML based
    PDF writer.
    '''

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'
    commit_name = 'pdf_output'
    ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}

    options = {
        OptionRecommendation(name='use_profile_size', recommended_value=False,
            help=_('Instead of using the paper size specified in the PDF Output options,'
                   ' use a paper size corresponding to the current output profile.'
                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
        OptionRecommendation(name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
                   'are {} '
                   'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
        OptionRecommendation(name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
                   'non default output profile is used. Default is letter. Choices '
                   'are {}').format(', '.join(PAPER_SIZES))),
        OptionRecommendation(name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'e.g. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated PDF.')),
        OptionRecommendation(name='pdf_serif_family',
            recommended_value='Times', help=_(
                'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_sans_family',
            recommended_value='Helvetica', help=_(
                'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_mono_family',
            recommended_value='Courier', help=_(
                'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
        # The help text used to be a copy of the monospace option's text,
        # which did not describe this option (a choice between font types).
        OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
            recommended_value='serif', help=_(
                'The type of font family (serif, sans or mono) used as the standard font')),
        OptionRecommendation(name='pdf_default_font_size',
            recommended_value=20, help=_(
                'The default font size')),
        OptionRecommendation(name='pdf_mono_font_size',
            recommended_value=16, help=_(
                'The default font size for monospaced text')),
        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
        OptionRecommendation(name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')
        ),
        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),
        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
            help=_('Use the page margins specified in the input document via @page CSS rules.'
                   ' This will cause the margins specified in the conversion settings to be ignored.'
                   ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
        ),
        OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
            help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
                   ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
        ),
        OptionRecommendation(name='uncompressed_pdf',
            recommended_value=False, help=_(
                'Generate an uncompressed PDF, useful for debugging.')
        ),
        OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
            level=OptionRecommendation.LOW,
            help=_(
                'Shift the text horizontally by the specified offset (in pts).'
                ' On odd numbered pages, it is shifted to the right and on even'
                ' numbered pages to the left. Use negative numbers for the opposite'
                ' effect. Note that this setting is ignored on pages where the margins'
                ' are smaller than the specified offset. Shifting is done by setting'
                ' the PDF CropBox, not all software respects the CropBox.'
            )
        ),
    }

    def specialize_options(self, log, opts, input_fmt):
        '''
        Register the fake URL scheme with Qt WebEngine, force Qt usage and
        remember the input format. When document margins are requested, the
        common margin options are set to -1.
        '''
        # Ensure Qt is setup to be used with WebEngine
        # specialize_options is called early enough in the pipeline
        # that hopefully no Qt application has been constructed as yet
        from qt.webengine import QWebEngineUrlScheme
        from qt.webengine import QWebEnginePage  # noqa
        from calibre.gui2 import must_use_qt
        from calibre.constants import FAKE_PROTOCOL
        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
        scheme.setFlags(QWebEngineUrlScheme.Flag.SecureScheme)
        QWebEngineUrlScheme.registerScheme(scheme)
        must_use_qt()
        self.input_fmt = input_fmt

        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document margins
            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Convert ``oeb_book`` to a PDF written to ``output_path``.

        The book metadata is round-tripped through an OPF 2 package to obtain
        a book metadata object, then the work is dispatched to either
        :meth:`convert_images` or :meth:`convert_text` depending on
        ``input_plugin.is_image_collection``.
        '''
        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})

        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path
        from calibre.ebooks.oeb.base import OPF, OPF2_NS
        from lxml import etree
        from io import BytesIO
        # Here OPF is the tag-name helper from oeb.base ...
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
            nsmap={None: OPF2_NS})
        # ... and from here on OPF is the metadata class from opf2.
        from calibre.ebooks.metadata.opf2 import OPF
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        '''Render a collection of images to the output PDF.'''
        from calibre.ebooks.pdf.image_writer import convert
        convert(images, self.output_path, self.opts, self.metadata, self.report_progress)

    def get_cover_data(self):
        '''
        Store the raw cover bytes in ``self.cover_data`` when the metadata
        names a cover that is present in the manifest as bytes.
        '''
        oeb = self.oeb
        if (oeb.metadata.cover and unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = unicode_type(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            if isinstance(item.data, bytes):
                self.cover_data = item.data

    def process_fonts(self):
        ''' Make sure all fonts are embeddable '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                try:
                    s = rule.style
                    src = s.getProperty('src').propertyValue[0].uri
                except Exception:
                    # @font-face rule without a usable src, skip it
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None:
                    continue

                raw = nraw = ff.data
                if path not in processed:
                    processed.add(path)
                    try:
                        nraw = remove_embed_restriction(raw)
                    except Exception:
                        # Best effort: leave fonts that cannot be processed as is
                        continue
                    if nraw != raw:
                        ff.data = nraw
                        self.oeb.container.write(path, nraw)

    def convert_text(self, oeb_book):
        '''
        Render a text based book: collect the cover, make fonts embeddable,
        annotate documents with their stored page margins, dump the book as
        OEB into a temporary directory and run the HTML to PDF writer on the
        resulting OPF.
        '''
        import json
        from calibre.ebooks.pdf.html_writer import convert
        self.get_cover_data()
        self.process_fonts()

        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            for href, margins in iteritems(self.stored_page_margins):
                item = oeb_book.manifest.hrefs.get(href)
                if item is not None:
                    root = item.data
                    if hasattr(root, 'xpath') and margins:
                        root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_dir = os.path.realpath(oeb_dir)
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            convert(
                opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
                log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
            )
class KEPUBInput(EPUBInput):
    '''
    Input plugin for Kobo KEPUB files.

    The archive is unpacked into the current directory and its OPF is
    normalized into ``content.opf``; after conversion the Kobo specific
    sentence spans can optionally be stripped.
    '''

    name = 'KePub Input'
    description = 'Convert KEPUB files (.kepub) to HTML'
    author = 'David Forrester'
    file_types = set(['kepub'])
    version = plugin_version
    minimum_calibre_version = (0, 1, 0)

    options = {
        OptionRecommendation(
            name='strip_kobo_spans',
            recommended_value=True,
            help=_('Kepubs have spans wrapping each sentence. These are used by the ereader for the reading location '
                   'and bookmark location. They are not used by an ePub reader but are valid code and can be safely be '
                   'left in the ePub. If you plan to edit the ePub, it is recommended that you remove the spans.'
                   )),
    }

    recommendations = set([])

    def gui_configuration_widget(self, parent, get_option_by_name,
                                 get_option_help, db, book_id=None):
        '''Return the configuration widget for this plugin's options.'''
        from calibre_plugins.kepubin.conversion.input_config import PluginWidget
        return PluginWidget(parent, get_option_by_name, get_option_help, db,
                            book_id)

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Unpack the KEPUB in ``stream`` into the current directory and return
        the absolute path of the rewritten ``content.opf``.

        Raises ValueError when no OPF can be found, when the book uses DTBook
        markup or when the spine ends up empty, and DRMError when a
        ``rights.xml`` file is present.
        '''
        log("KEPUBInput::convert - start")
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF

        # os.getcwdu() only exists on Python 2; on Python 3 os.getcwd()
        # already returns text.
        getcwd = getattr(os, 'getcwdu', os.getcwd)

        try:
            zf = ZipFile(stream)
            zf.extractall(getcwd())
        except Exception:
            log.exception('KEPUB appears to be invalid ZIP file, trying a '
                          'more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)

        opf = self.find_opf()
        if opf is None:
            # Fall back to the first non hidden .opf file in the tree
            for f in walk(u'.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError(
                _('%s is not a valid KEPUB file (could not find opf)') % path)

        encfile = os.path.abspath('rights.xml')
        if os.path.exists(encfile):
            raise DRMError(os.path.basename(path))

        opf = os.path.relpath(opf, getcwd())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self.encrypted_fonts = []

        if len(parts) > 1 and parts[0]:
            # The OPF lives in a sub-directory but is rewritten to the current
            # directory below, so prefix every href with that directory.
            delta = '/'.join(parts[:-1]) + '/'
            for elem in opf.itermanifest():
                elem.set('href', delta + elem.get('href'))
            for elem in opf.iterguide():
                elem.set('href', delta + elem.get('href'))

        f = self.rationalize_cover3 if opf.package_version >= 3.0 else \
            self.rationalize_cover2
        self.removed_cover = f(opf, log)

        self.optimize_opf_parsing = opf
        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    _('EPUB files with DTBook markup are not supported'))

        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_ and y.get('media-type', None) in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'
            }:
                not_for_spine.add(id_)

        # Drop spine entries without an idref, duplicates, and entries that
        # point at the manifest items collected above.
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if len(list(opf.iterspine())) == 0:
            raise ValueError(_('No valid entries in the spine of this EPUB'))

        with open('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath(u'content.opf')

    def postprocess_book(self, oeb, opts, log):
        '''
        Run the base class post-processing and then, unless
        ``opts.strip_kobo_spans`` is off, unwrap every ``koboSpan`` span in
        the spine documents.
        '''
        log("KEPUBInput::postprocess_book - start")
        from calibre.ebooks.oeb.base import XHTML_NS

        # The Kobo spans wrap each sentence. Remove them and add their text to
        # the parent tag.
        def refactor_span(a):
            p = a.getparent()
            idx = p.index(a) - 1
            p.remove(a)
            if idx < 0:
                # The span was the first child: its text goes to the parent
                if p.text is None:
                    p.text = ''
                p.text += a.text if a.text else ''
                p.text += a.tail if a.tail else ''
            else:
                # Otherwise it goes to the tail of the preceding sibling
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.text if a.text else ''
                p[idx].tail += a.tail if a.tail else ''

        super(KEPUBInput, self).postprocess_book(oeb, opts, log)

        if not opts.strip_kobo_spans:
            log("KEPUBInput::postprocess_book - not stripping kobo spans")
            return

        for item in oeb.spine:
            log("item.__class__.__name__", item.__class__.__name__)
            if not hasattr(item.data, 'xpath'):
                continue

            for a in item.data.xpath('//h:span[@class="koboSpan"]',
                                     namespaces={'h': XHTML_NS}):
                refactor_span(a)

        log("KEPUBInput::postprocess_book - end")

    def process_encryption(self, encfile, opf, log):
        '''Return True when no ``rights.xml`` exists in the current directory.'''
        # Shouldn't get called, but overriding just in case.
        encfile = os.path.abspath('rights.xml')
        return not os.path.exists(encfile)
class HTMLZOutput(OutputFormatPlugin):
    '''
    Output plugin that writes an HTMLZ archive: a ZIP file holding a single
    HTML file, optionally a stylesheet, the images, a cover and the metadata
    as an OPF file.
    '''

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'
    commit_name = 'htmlz_output'
    ui_data = {
        'css_choices': {
            'class': _('Use CSS classes'),
            'inline': _('Use the style attribute'),
            'tag': _('Use HTML tags wherever possible')
        },
        'sheet_choices': {
            'external': _('Use an external CSS file'),
            'inline': _('Use a <style> tag in the HTML file')
        }
    }

    options = {
        OptionRecommendation(
            name='htmlz_css_type',
            recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['css_choices']),
            help=_('Specify the handling of CSS. Default is class.\n'
                   'class: {class}\n'
                   'inline: {inline}\n'
                   'tag: {tag}').format(**ui_data['css_choices'])),
        OptionRecommendation(
            name='htmlz_class_style',
            recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sheet_choices']),
            help=_('How to handle the CSS when using css-type = \'class\'.\n'
                   'Default is external.\n'
                   'external: {external}\n'
                   'inline: {inline}').format(**ui_data['sheet_choices'])),
        OptionRecommendation(
            name='htmlz_title_filename',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'If set this option causes the file name of the HTML file'
                ' inside the HTMLZ archive to be based on the book title.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Build the HTMLZ contents in a temporary directory and zip that
        directory into ``output_path``.

        Failures while extracting the cover are printed and otherwise
        ignored; the rest of the archive is still produced.
        '''
        from lxml import etree
        from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from calibre.utils.zipfile import ZipFile
        from calibre.utils.filenames import ascii_filename

        # HTML
        if opts.htmlz_css_type == 'inline':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from calibre.utils.filenames import shorten_components_to
                fname = shorten_components_to(100, (ascii_filename(
                    unicode_type(oeb_book.metadata.title[0])), ))[0]
            with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
                if isinstance(html, unicode_type):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS
            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
                css = htmlizer.get_css(oeb_book)
                # The file is opened in binary mode, so text must be encoded
                if isinstance(css, unicode_type):
                    css = css.encode('utf-8')
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    tf.write(css)

            # Images
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, u'images')):
                    os.makedirs(os.path.join(tdir, u'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            # SVG items are parsed trees, serialize them
                            data = etree.tostring(item.data, encoding=unicode_type)
                        else:
                            data = item.data
                        # Serialized SVG is text; encode it explicitly rather
                        # than writing text to a binary file.
                        if isinstance(data, unicode_type):
                            data = data.encode('utf-8')
                        fname = os.path.join(tdir, u'images', images[item.href])
                        with open(fname, 'wb') as img:
                            img.write(data)

            # Cover
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from calibre.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    with lopen(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except Exception:
                # A missing or broken cover must not abort the conversion
                import traceback
                traceback.print_exc()

            # Metadata
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(
                    io.BytesIO(
                        etree.tostring(oeb_book.metadata.to_opf1(),
                                       encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalized before the
                # temporary directory is removed.
                htmlz.close()
class TXTOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the book as a single text file, either as
    plain text or marked up as Markdown or Textile.
    '''

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'
    ui_data = {
        'newline_types': NEWLINE_TYPES,
        'formatting_types': {
            'plain': _('Plain text'),
            'markdown': _('Markdown formatted text'),
            'textile': _('TexTile formatted text')
        },
    }

    options = {
        OptionRecommendation(
            name='newline',
            recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n',
            choices=NEWLINE_TYPES,
            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                   'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                   'For macOS use \'unix\'. \'system\' will default to the newline '
                   'type used by this OS.') % sorted(NEWLINE_TYPES)),
        OptionRecommendation(
            name='txt_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is utf-8.')),
        OptionRecommendation(
            name='inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(
            name='max_line_length',
            recommended_value=0,
            level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
                   'the first space before the specified value. If no space is found '
                   'the line will be broken at the space after and will exceed the '
                   'specified value. Also, there is a minimum of 25 characters. '
                   'Use 0 to disable line splitting.')),
        OptionRecommendation(
            name='force_max_line_length',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'Force splitting on the max-line-length value when no space '
                'is present. Also allows max-line-length to be below the minimum'
            )),
        OptionRecommendation(name='txt_output_formatting',
                             recommended_value='plain',
                             choices=list(ui_data['formatting_types']),
                             help=_('Formatting used within the document.\n'
                                    '* plain: {plain}\n'
                                    '* markdown: {markdown}\n'
                                    '* textile: {textile}').format(
                                        **ui_data['formatting_types'])),
        OptionRecommendation(
            name='keep_links',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'Do not remove links within the document. This is only '
                'useful when paired with a TXT output formatting option that '
                'is not none because links are always removed with plain text output.'
            )),
        OptionRecommendation(
            name='keep_image_references',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Do not remove image references within the document. This is only '
                   'useful when paired with a TXT output formatting option that '
                   'is not none because links are always removed with plain text output.'
                   )),
        OptionRecommendation(
            name='keep_color',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Do not remove font color from output. This is only useful when '
                   'TXT output formatting is set to textile. Textile is the only '
                   'formatting that supports setting font color. If this option is '
                   'not specified font color will not be set and default to the '
                   'color displayed by the reader (generally this is black).')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Extract the book text with the writer selected by
        ``opts.txt_output_formatting``, normalize its newlines and write it,
        encoded with ``opts.txt_output_encoding``, to ``output_path``.

        ``output_path`` may be a filesystem path or an object with a
        ``write`` method; only a stream opened here is closed here.
        '''
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        formatting = opts.txt_output_formatting.lower()
        if formatting == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif formatting == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if not os.path.exists(out_dir) and out_dir != '':
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
        finally:
            # Do not leak the file handle when writing fails; streams passed
            # in by the caller stay open.
            if close:
                out_stream.close()
class HTMLOutput(OutputFormatPlugin):
    '''
    Output plugin that renders the book through HTML templates into an index
    page plus one page per spine item and packs the result into a ZIP file.
    '''

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'
    commit_name = 'html_output'

    options = {
        OptionRecommendation(
            name='template_css',
            help=_(
                'CSS file used for the output instead of the default file')),
        OptionRecommendation(
            name='template_html_index',
            help=_('Template used for generation of the HTML index file instead of the default file'
                   )),
        OptionRecommendation(
            name='template_html',
            help=_('Template used for the generation of the HTML contents of the book instead of the default file'
                   )),
        OptionRecommendation(
            name='extract_to',
            help=_(
                'Extract the contents of the generated ZIP file to the '
                'specified directory. WARNING: The contents of the directory '
                'will be deleted.')),
    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate table of contents

        Returns a ``div`` element wrapping nested ``ul``/``li``/``a``
        elements; link targets are made relative to the directory of
        ``ref_url``.
        '''
        from lxml import etree
        from urllib import unquote

        from calibre.ebooks.oeb.base import element
        from calibre.utils.cleantext import clean_xml_chars
        with CurrentDir(output_dir):

            def build_node(current_node, parent=None):
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, ('ul'))
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(abspath(unquote(node.href)),
                                   dirname(ref_url))
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    link = element(point, 'a', href=clean_xml_chars(href))
                    title = node.title
                    if isinstance(title, bytes):
                        title = title.decode('utf-8')
                    if title:
                        title = re.sub(r'\s+', ' ', title)
                    link.text = clean_xml_chars(title)
                    build_node(node, point)
                return parent

            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        '''Return the table of contents serialized as UTF-8 encoded markup.'''
        from lxml import etree

        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root,
                              pretty_print=True,
                              encoding='utf-8',
                              xml_declaration=False)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render the book into a temporary directory and zip it to
        ``output_path``; when ``opts.extract_to`` is set, that directory is
        deleted, recreated and filled with the archive contents.
        '''
        from lxml import etree
        from calibre.utils import zipfile
        from templite import Templite
        from urllib import unquote
        from calibre.ebooks.html.meta import EasyMeta

        def read_template(custom_path, default_resource):
            # Read a user supplied template (closing the file afterwards) or
            # fall back to the bundled resource.
            if custom_path is not None:
                with open(custom_path, 'rb') as template_file:
                    return template_file.read()
            return P(default_resource, data=True)

        # read template files
        template_html_index_data = read_template(
            opts.template_html_index,
            'templates/html_export_default_index.tmpl')
        template_html_data = read_template(
            opts.template_html, 'templates/html_export_default.tmpl')
        template_css_data = read_template(
            opts.template_css, 'templates/html_export_default.css')

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        output_file = os.path.join(
            tempdir,
            basename(re.sub(r'\.zip', '', output_path) + '.html'))
        output_dir = re.sub(r'\.html', '', output_file) + '_files'

        if not exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        # index page
        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file,
                                              output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir + os.sep + nextLink,
                               dirname(output_file))
            cssLink = relpath(abspath(css_path), dirname(output_file))
            tocUrl = relpath(output_file, dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                                toc=html_toc,
                                meta=meta,
                                nextLink=nextLink,
                                tocUrl=tocUrl,
                                cssLink=cssLink,
                                firstContentPageLink=nextLink)
            if isinstance(t, unicode_type):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            for item in oeb_book.manifest:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                if not exists(dir):
                    os.makedirs(dir)
                if item.spine_position is not None:
                    # Spine items are rendered below; only create the file
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(str(item))
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath(
                    '//h:head',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head,
                                              pretty_print=True,
                                              encoding='utf-8')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(
                    re.compile(r'\<style.*\/style\>', re.M | re.S), '',
                    head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>',
                                      head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath(
                    '//h:body',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body,
                                               pretty_print=True,
                                               encoding='utf-8')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                       r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position + 1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position + 1].href
                    nextLink = relpath(abspath(nextLink), dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position - 1].href
                    prevLink = relpath(abspath(prevLink), dir)
                else:
                    prevLink = None

                cssLink = relpath(abspath(css_path), dir)
                tocUrl = relpath(output_file, dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
                t = templite.render(ebookContent=ebook_content,
                                    prevLink=prevLink,
                                    nextLink=nextLink,
                                    has_toc=bool(oeb_book.toc.count()),
                                    toc=toc,
                                    tocUrl=tocUrl,
                                    head_content=head_content,
                                    meta=meta,
                                    cssLink=cssLink,
                                    firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t)
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        try:
            zfile.add_dir(output_dir, basename(output_dir))
            zfile.write(output_file, basename(output_file),
                        zipfile.ZIP_DEFLATED)

            if opts.extract_to:
                if os.path.exists(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                os.makedirs(opts.extract_to)
                zfile.extractall(opts.extract_to)
                self.log('Zip file extracted to', opts.extract_to)
        finally:
            # Always release the archive handle, even when adding or
            # extracting fails.
            zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
class KEPubOutput(OutputFormatPlugin):
    '''
    Output plugin that produces a KePub: the book is first written by the
    EPUB output plugin and the resulting file is then modified in place.
    '''

    name = 'KePub Output'
    author = 'Joel Goguen'
    file_type = 'kepub'
    version = plugin_version
    minimum_calibre_version = plugin_minimum_calibre_version

    epub_output_plugin = None
    configdir = os.path.join(config_dir, 'plugins')
    reference_kepub = os.path.join(configdir, 'reference.kepub.epub')
    options = set([
        OptionRecommendation(
            name='kepub_hyphenate',
            recommended_value=True,
            help='Select this to add a CSS file which enables hyphenation. The language used will be the language defined for the book in calibre. Please see the README file for directions on updating hyphenation dictionaries.'
        ),
        OptionRecommendation(
            name='kepub_replace_lang',
            recommended_value=True,
            help='Select this to replace the defined language in each content file inside the ePub.'
        ),
        OptionRecommendation(
            name='kepub_clean_markup',
            recommended_value=True,
            help='Select this to clean up the internal ePub markup.')
    ])

    recommendations = set([])

    def __init__(self, *args, **kwargs):
        '''
        Create the wrapped EPUB output plugin and merge its options and
        recommendations into this plugin's own.
        '''
        self.epub_output_plugin = EPUBOutput(*args, **kwargs)
        self.options = self.options.union(self.epub_output_plugin.options)
        self.recommendations = self.recommendations.union(
            self.epub_output_plugin.recommendations)
        OutputFormatPlugin.__init__(self, *args, **kwargs)

    def gui_configuration_widget(self, parent, get_option_by_name,
                                 get_option_help, db, book_id=None):
        '''Return the configuration widget for this plugin's options.'''
        from calibre_plugins.koboconversion.conversion.config import PluginWidget
        return PluginWidget(parent, get_option_by_name, get_option_help, db,
                            book_id)

    def convert(self, oeb_book, output, input_plugin, opts, log):
        '''
        Write ``oeb_book`` to ``output`` as an EPUB, then add a plugin info
        file and apply the KePub modifications selected in ``opts``.

        Nothing beyond the plain EPUB is written when the container reports
        that it is DRM encumbered.
        '''
        self.epub_output_plugin.convert(oeb_book, output, input_plugin, opts,
                                        log)
        # NOTE(review): the container gets default_log rather than the log
        # passed to this method -- confirm that is intended.
        container = KEPubContainer(output, default_log)

        if container.is_drm_encumbered:
            return

        # Write the details file
        o = {
            'kepub_output_version': ".".join(str(n) for n in self.version),
            'kepub_output_currenttime': datetime.utcnow().ctime()
        }
        kte_data_file = self.temporary_file('_KePubOutputPluginInfo')
        kte_data_file.write(json.dumps(o))
        kte_data_file.close()
        container.copy_file_to_container(kte_data_file.name,
                                         name='plugininfo.kte',
                                         mt='application/json')

        title = container.opf_xpath("./opf:metadata/dc:title/text()")
        if len(title) > 0:
            title = title[0]
        else:
            title = NULL_VALUES['title']

        authors = container.opf_xpath(
            './opf:metadata/dc:creator[@opf:role="aut"]/text()')
        if len(authors) < 1:
            authors = NULL_VALUES['authors']

        mi = Metadata(title, authors)

        language = container.opf_xpath("./opf:metadata/dc:language/text()")
        if len(language) > 0:
            mi.languages = language
            language = language[0]
        else:
            mi.languages = NULL_VALUES['languages']
            language = NULL_VALUES['language']
        # This used to be the bare expression ``mi.language``, which did
        # nothing and left the language computed above unused.
        mi.language = language

        modify_epub(container,
                    output,
                    metadata=mi,
                    opts={
                        'clean_markup': opts.kepub_clean_markup,
                        'hyphenate': opts.kepub_hyphenate,
                        'replace_lang': opts.kepub_replace_lang,
                        'smarten_punctuation': False,
                        'extended_kepub_features': True
                    })
class HTMLInput(InputFormatPlugin):
    """Input plugin that turns HTML (or OPF) files into an OEB book.

    For HTML input the root file and the HTML files returned by
    ``get_filelist`` are added to the manifest and spine, local resources
    they reference are pulled into the manifest, links are rewritten to the
    generated hrefs and a table of contents is generated. OPF input is
    delegated directly to the plumber's ``create_oebbook``.
    """

    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    options = {
        OptionRecommendation(
            name='breadth_first',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.')),
        OptionRecommendation(
            name='max_levels',
            recommended_value=5,
            level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.')),
        OptionRecommendation(
            name='dont_package',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of the conversion pipeline.')),
    }

    def convert(self, stream, opts, file_ext, log, accelerators):
        """Convert ``stream`` into an OEB book.

        Raises ``ValueError`` when ``opts.dont_package`` is set for a
        non-OPF input file.
        """
        # Reset the cached answer used by is_case_sensitive().
        self._is_case_sensitive = None
        basedir = getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError(
                    'The --dont-package option is not supported for an HTML input file'
                )
            from calibre.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                from calibre.ebooks.metadata.meta import metadata_from_filename
                # Start from what the file name yields, then overlay the
                # metadata read from the stream.
                fmi = metadata_from_filename(fname)
                fmi.smart_update(mi)
                mi = fmi
            return self.create_oebbook(stream.name, basedir, opts, log, mi)

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                              encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
        """Report whether the filesystem holding ``path`` is case sensitive.

        The first answer computed from an existing path is cached on the
        instance. When ``path`` is empty or does not exist, ``islinux or
        isbsd`` is returned and nothing is cached.
        """
        cached = getattr(self, '_is_case_sensitive', None)
        if cached is not None:
            return cached
        if not path or not os.path.exists(path):
            return islinux or isbsd
        # If both the lower- and upper-cased spellings exist, the
        # filesystem is taken to be case insensitive.
        self._is_case_sensitive = not (os.path.exists(path.lower()) and
                                       os.path.exists(path.upper()))
        return self._is_case_sensitive

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Build and return an OEB book rooted at the HTML file ``htmlpath``.

        ``mi`` supplies the initial metadata; missing language, creator and
        title entries are filled in with fallbacks (each logged as a
        warning) and a fresh UUID identifier is always added.
        """
        import logging
        import uuid

        import css_parser

        from calibre import guess_type
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.ebooks.oeb.base import (
            DirContainer, rewrite_links, urlnormalize, urldefrag,
            BINARY_MIME, OEB_STYLES, xpath, urlquote)
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.utils.localization import canonicalize_lang

        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                             encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # --- Metadata, with fallbacks for anything missing ---------------
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn('Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            authors = getattr(opts, 'authors', None)
            if authors:
                authors = string_to_authors(authors)
            if not authors:
                oeb.logger.warn('Creator not specified')
                authors = [self.oeb.translate(__('Unknown'))]
            for aut in authors:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # NOTE(review): the *first* identifier is assigned here, not
                # the one that matched -- confirm this is intended.
                self.oeb.uid = metadata.identifier[0]
                break

        # --- Add every non-binary HTML file to manifest and spine --------
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                                         ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(
                id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Stash the lazily imported helpers so resource_adder() can reach
        # them without importing again.
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        # --- Rewrite links in HTML, then in stylesheets ------------------
        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                # Resolve URLs relative to the directory the stylesheet was
                # added from, if it is one of the added resources.
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(
                    item.data, partial(self.resource_adder, base=dpath))

        # --- Table of contents -------------------------------------------
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        # Use the <title> texts unless some of them repeat, in which case
        # the first-heading texts are used instead.
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        """Resolve ``link_`` to a local filesystem path.

        Returns ``(path, fragment)``, or ``(None, None)`` when the link
        cannot be decoded or parsed, is not a local resource, or resolves
        to an empty path.
        """
        from calibre.ebooks.html.input import Link
        if not isinstance(link_, unicode_type):
            try:
                # 'strict' is the real handler name; the previous 'error'
                # is not a registered codec error handler.
                link_ = link_.decode('utf-8', 'strict')
            except Exception:
                self.log.warn('Failed to decode link %r. Ignoring' % link_)
                return None, None
        try:
            parsed = Link(link_, base if base else getcwd())
        except Exception:
            self.log.exception('Failed to process link: %r' % link_)
            return None, None
        if parsed.path is None:
            # Not a local resource
            return None, None
        link = parsed.path.replace('/', os.sep).strip()
        frag = parsed.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        """Link-rewriting callback: add the target of ``link_`` to the book.

        Returns the manifest href for the resource (with the fragment
        re-attached when there is one). ``link_`` is returned unchanged
        when it is not a readable local file or points at a directory, and
        ``None`` is returned for links to plain-text files.
        """
        from polyglot.urllib import quote
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except Exception:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if not self.is_case_sensitive(tempfile.gettempdir()):
            link = link.lower()
        if link not in self.added_resources:
            bhref = os.path.basename(link)
            item_id, href = self.oeb.manifest.generate(
                id='added', href=sanitize_file_name(bhref))
            guessed = self.guess_type(href)[0]
            media_type = guessed or self.BINARY_MIME
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r' % link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except EnvironmentError:
                    pass
                else:
                    if img:
                        media_type = self.guess_type(
                            'dummy.' + img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link)
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                                                   self.oeb.log,
                                                   ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(item_id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            if isinstance(bhref, unicode_type):
                bhref = bhref.encode('utf-8')
            item.html_input_href = as_unicode(quote(bhref))
            if guessed in self.OEB_STYLES:
                item.override_css_fetch = partial(self.css_import_handler,
                                                  os.path.dirname(link))
            # Deliberate bare attribute access: per the "Load into memory"
            # note above, this is what triggers the load.
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink

    def css_import_handler(self, base, href):
        """Fetch callback for stylesheet imports.

        Returns ``(None, css_text)`` on success and ``(None, None)`` when
        ``href`` is not a readable local file or reading/preprocessing it
        fails (the failure is logged).
        """
        link, frag = self.link_to_local_path(href, base=base)
        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
            return None, None
        try:
            with open(link, 'rb') as f:
                raw = f.read().decode('utf-8', 'replace')
            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except Exception:
            self.log.exception('Failed to read CSS file: %r' % link)
            return None, None
        return None, raw
class RecipeInput(InputFormatPlugin):
    """Input plugin that runs a recipe to download periodical content.

    ``convert`` locates and compiles a recipe, runs it (unless the input is
    an already downloaded recipe archive) and returns the path of the
    ``.opf`` file found in the current working directory.
    """

    name = 'Recipe Input'
    author = 'Kovid Goyal'
    description = _('Download periodical content from the internet')
    file_types = {'recipe', 'downloaded_recipe'}
    commit_name = 'recipe_input'

    recommendations = {
        ('chapter', None, OptionRecommendation.HIGH),
        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('input_encoding', None, OptionRecommendation.HIGH),
        ('input_profile', 'default', OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('insert_metadata', False, OptionRecommendation.HIGH),
    }

    options = {
        OptionRecommendation(
            name='test',
            recommended_value=False,
            help=_('Useful for recipe development. Forces'
                   ' max_articles_per_feed to 2 and downloads at most 2 feeds.'
                   ' You can change the number of feeds and articles by supplying optional arguments.'
                   ' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.'
                   )),
        OptionRecommendation(
            name='username',
            recommended_value=None,
            help=_('Username for sites that require a login to access '
                   'content.')),
        OptionRecommendation(
            name='password',
            recommended_value=None,
            help=_('Password for sites that require a login to access '
                   'content.')),
        OptionRecommendation(
            name='dont_download_recipe',
            recommended_value=False,
            help=_('Do not download latest version of builtin recipes from the calibre server'
                   )),
        OptionRecommendation(
            name='lrf',
            recommended_value=False,
            help='Optimize fetching for subsequent conversion to LRF.'),
    }

    def convert(self, recipe_or_file, opts, file_ext, log, accelerators):
        """Resolve, compile and run a recipe; return the resulting OPF path.

        The recipe source is taken from, in order of precedence: a
        ``downloaded_recipe`` archive, the ``CALIBRE_RECIPE_URN``
        environment variable, a readable file at ``recipe_or_file``, or a
        builtin recipe looked up by title.

        Raises ``ValueError`` when no usable recipe can be found and
        ``RecipeDisabled`` when the recipe has a ``recipe_disabled`` value.
        Returns ``None`` implicitly if no ``.opf`` file is found.
        """
        from calibre.web.feeds.recipes import compile_recipe
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            from calibre.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            try:
                zf.extractall()
            finally:
                # Close the archive even when extraction fails.
                zf.close()
            with lopen('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            # The recipe object is only instantiated here; download() is
            # not called in this branch.
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                from calibre.web.feeds.recipes.collection import (
                    get_custom_recipe, get_builtin_recipe_by_id)
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                # partition() yields (head, sep, tail); [::2] drops the sep.
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(
                        urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                with lopen(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                from calibre.web.feeds.recipes.collection import (
                    get_builtin_recipe_by_title, get_builtin_recipe_titles)
                title = getattr(opts, 'original_recipe_input_arg',
                                recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    # Retry without stripping the directory part.
                    title = getattr(opts, 'original_recipe_input_arg',
                                    recipe_or_file)
                    title = title.rpartition('.')[0]

                raw = get_builtin_recipe_by_title(
                    title, log=log,
                    download_recipe=not opts.dont_download_recipe)
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        # Stringify each part so join() also works when the
                        # version is a tuple of integers.
                        log.warn(
                            'Downloaded recipe needs calibre version at least: %s'
                            % '.'.join(map(str, recipe.requires_version)))
                        builtin = True
                except Exception:
                    log.exception(
                        'Failed to compile downloaded recipe. Falling '
                        'back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(title, log=log,
                                                      download_recipe=False)
                    if raw is None:
                        raise ValueError(
                            'Failed to find builtin recipe: ' + title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')

            if recipe is None:
                raise ValueError(
                    '%r is not a valid recipe file or builtin recipe' %
                    recipe_or_file)

            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro

        # Let the recipe override conversion options.
        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        # Prefer an OPF directly in the working directory, then fall back
        # to whatever walk() yields beneath it.
        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

    def postprocess_book(self, oeb, opts, log):
        """Run the recipe object's post-processing hooks, if there is one."""
        # getattr() guards against this being called before convert() has
        # set recipe_object.
        recipe_object = getattr(self, 'recipe_object', None)
        if recipe_object is not None:
            recipe_object.internal_postprocess_book(oeb, opts, log)
            recipe_object.postprocess_book(oeb, opts, log)

    def specialize(self, oeb, opts, log, output_fmt):
        """Remove ``calibre_navbar`` divs when ``opts.no_inline_navbars`` is set."""
        if opts.no_inline_navbars:
            from calibre.ebooks.oeb.base import XPath
            # Build the expression once instead of once per spine item.
            find_navbars = XPath(
                '//h:div[contains(@class, "calibre_navbar")]')
            for item in oeb.spine:
                for div in find_navbars(item.data):
                    div.getparent().remove(div)

    def save_download(self, zf):
        """Store the recipe source in ``zf`` as ``download.recipe``."""
        raw = self.recipe_source
        if isinstance(raw, unicode_type):
            raw = raw.encode('utf-8')
        zf.writestr('download.recipe', raw)