示例#1
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.metadata.toc import TOC
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.utils.zipfile import ZipFile

        self.options = options
        self.log = log
        pages, images = [], []
        toc = TOC()

        if file_ext == 'pmlz':
            log.debug('De-compressing content to temporary directory...')
            with TemporaryDirectory('_unpmlz') as tdir:
                zf = ZipFile(stream)
                zf.extractall(tdir)

                pmls = glob.glob(os.path.join(tdir, '*.pml'))
                for pml in pmls:
                    html_name = os.path.splitext(
                        os.path.basename(pml))[0] + '.html'
                    html_path = os.path.join(os.getcwd(), html_name)

                    pages.append(html_name)
                    log.debug('Processing PML item %s...', pml)
                    ttoc = self.process_pml(pml, html_path)
                    toc += ttoc
                images = self.get_images(stream, tdir, True)
        else:
            toc = self.process_pml(stream, 'index.html')
            pages.append('index.html')

            if hasattr(stream, 'name'):
                images = self.get_images(
                    stream, os.path.abspath(os.path.dirname(stream.name)))

        # We want pages to be orded alphabetically.
        pages.sort()

        manifest_items = []
        for item in pages + images:
            manifest_items.append((item, None))

        from ebook_converter.ebooks.metadata.meta import get_metadata
        log.debug('Reading metadata from input file...')
        mi = get_metadata(stream, 'pml')
        if 'images/cover.png' in images:
            mi.cover = 'images/cover.png'
        opf = OPFCreator(os.getcwd(), mi)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest_items)
        opf.create_spine(pages)
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as opffile:
            with open('toc.ncx', 'wb') as tocfile:
                opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(os.getcwd(), 'metadata.opf')
示例#2
0
    def run(self, archive):
        from ebook_converter.utils.zipfile import ZipFile
        is_rar = archive.lower().endswith('.rar')
        if is_rar:
            from ebook_converter.utils.unrar import extract_member, names
        else:
            zf = ZipFile(archive, 'r')

        if is_rar:
            fnames = list(names(archive))
        else:
            fnames = zf.namelist()

        def fname_ok(fname):
            bn = os.path.basename(fname).lower()
            if bn == 'thumbs.db':
                return False
            if '.' not in bn:
                return False
            if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
                return False
            if '__MACOSX' in fname.split('/'):
                return False
            return True

        fnames = list(filter(fname_ok, fnames))
        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract' + ext)
            with open(archive, 'rb') as f:
                of.write(f.read())
            of.close()
            return of.name
        if len(fnames) > 1 or not fnames:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in {
                'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'
        }:
            return archive

        of = self.temporary_file('_archive_extract.' + ext)
        with closing(of):
            if is_rar:
                data = extract_member(archive, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
示例#3
0
def get_fb2_data(stream):
    from ebook_converter.utils.zipfile import ZipFile, BadZipfile
    pos = stream.tell()
    try:
        zf = ZipFile(stream)
    except BadZipfile:
        stream.seek(pos)
        ans = stream.read()
        zip_file_name = None
    else:
        names = zf.namelist()
        names = [x for x in names if x.lower().endswith('.fb2')] or names
        zip_file_name = names[0]
        ans = zf.open(zip_file_name).read()
    return ans, zip_file_name
示例#4
0
    def extract(self, stream):
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                               ' more forgiving ZIP parser')
            from ebook_converter.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f
示例#5
0
def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
    stream.seek(0)
    raw, zip_file_name = get_fb2_data(stream)
    root = _get_fbroot(raw)
    ctx = Context(root)
    desc = ctx.get_or_create(root, 'description')
    ti = ctx.get_or_create(desc, 'title-info')

    indent = ti.text

    _set_comments(ti, mi, ctx)
    _set_series(ti, mi, ctx)
    _set_tags(ti, mi, ctx)
    _set_authors(ti, mi, ctx)
    _set_title(ti, mi, ctx)
    _set_cover(ti, mi, ctx)

    for child in ti:
        child.tail = indent

    # Apparently there exists FB2 reading software that chokes on the use of
    # single quotes in xml declaration. Sigh. See
    # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
    raw = b'<?xml version="1.0" encoding="UTF-8"?>\n'
    raw += etree.tostring(root, method='xml', encoding='utf-8',
                          xml_declaration=False)

    stream.seek(0)
    stream.truncate()
    if zip_file_name:
        from ebook_converter.utils.zipfile import ZipFile
        with ZipFile(stream, 'w') as zf:
            zf.writestr(zip_file_name, raw)
    else:
        stream.write(raw)
示例#6
0
def set_metadata(stream, mi):

    with ZipFile(stream) as zf:
        raw = _set_metadata(zf.open('meta.xml').read(), mi)
        # print(raw.decode('utf-8'))

    stream.seek(os.SEEK_SET)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))
示例#7
0
 def __enter__(self, *args):
     """
     Add this plugin to the python path so that it's contents become
     directly importable.  Useful when bundling large python libraries into
     the plugin. Use it like this::
         with plugin:
             import something
     """
     if self.plugin_path is not None:
         from ebook_converter.utils.zipfile import ZipFile
         zf = ZipFile(self.plugin_path)
         extensions = {x.rpartition('.')[-1].lower() for x in
                       zf.namelist()}
         zip_safe = True
         for ext in ('pyd', 'so', 'dll', 'dylib'):
             if ext in extensions:
                 zip_safe = False
                 break
         if zip_safe:
             sys.path.insert(0, self.plugin_path)
             self.sys_insertion_path = self.plugin_path
         else:
             from ebook_converter.ptempfile import TemporaryDirectory
             self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
             self.sys_insertion_path = (self._sys_insertion_tdir.
                                        __enter__(*args))
             zf.extractall(self.sys_insertion_path)
             sys.path.insert(0, self.sys_insertion_path)
         zf.close()
示例#8
0
def initialize_container(path_to_container,
                         opf_name='metadata.opf',
                         extra_entries=[]):
    '''
    Create an empty EPUB document, with a default skeleton.
    '''
    rootfiles = ''
    for path, mimetype, _ in extra_entries:
        rootfiles += '<rootfile full-path="{0}" media-type="{1}"/>'.format(
            path, mimetype)
    CONTAINER = simple_container_xml(opf_name, rootfiles).encode('utf-8')
    zf = ZipFile(path_to_container, 'w')
    zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
    zf.writestr('META-INF/', b'', 0o755)
    zf.writestr('META-INF/container.xml', CONTAINER)
    for path, _, data in extra_entries:
        zf.writestr(path, data)
    return zf
示例#9
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from ebook_converter.ebooks.pml.pmlml import PMLMLizer
        from ebook_converter.utils.zipfile import ZipFile

        with TemporaryDirectory('_pmlz_output') as tdir:
            pmlmlizer = PMLMLizer(log)
            pml = str(pmlmlizer.extract_content(oeb_book, opts))
            with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
                out.write(pml.encode(opts.pml_output_encoding, 'replace'))

            img_path = os.path.join(tdir, 'index_img')
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs,
                              img_path, opts)

            log.debug('Compressing output...')
            pmlz = ZipFile(output_path, 'w')
            pmlz.add_dir(tdir)
示例#10
0
def get_comic_metadata(stream, stream_type, series_index='volume'):
    comment = None
    if stream_type == 'cbz':
        from ebook_converter.utils.zipfile import ZipFile
        zf = ZipFile(stream)
        comment = zf.comment
    elif stream_type == 'cbr':
        from ebook_converter.utils.unrar import comment as get_comment
        comment = get_comment(stream)

    return parse_comic_comment(comment or b'{}', series_index=series_index)
示例#11
0
 def run(self, archive):
     from ebook_converter.utils.zipfile import ZipFile
     with ZipFile(archive, 'r') as zf:
         fnames = zf.namelist()
         candidates = [x for x in fnames if x.lower().endswith('.docx')]
         if not candidates:
             return archive
         of = self.temporary_file('_kpf_extract.docx')
         with closing(of):
             of.write(zf.read(candidates[0]))
     return of.name
示例#12
0
    def safe_replace(self,
                     name,
                     datastream,
                     extra_replacements={},
                     add_missing=False):
        from ebook_converter.utils.zipfile import ZipFile, ZipInfo
        replacements = {name: datastream}
        replacements.update(extra_replacements)
        names = frozenset(list(replacements.keys()))
        found = set()

        def rbytes(name):
            r = replacements[name]
            if not isinstance(r, bytes):
                r = r.read()
            return r

        with SpooledTemporaryFile(max_size=100 * 1024 * 1024) as temp:
            ztemp = ZipFile(temp, 'w')
            for offset, header in self.file_info.values():
                if header.filename in names:
                    zi = ZipInfo(header.filename)
                    zi.compress_type = header.compression_method
                    ztemp.writestr(zi, rbytes(header.filename))
                    found.add(header.filename)
                else:
                    ztemp.writestr(header.filename,
                                   self.read(header.filename, spool_size=0))
            if add_missing:
                for name in names - found:
                    ztemp.writestr(name, rbytes(name))
            ztemp.close()
            zipstream = self.stream
            temp.seek(0)
            zipstream.seek(0)
            zipstream.truncate()
            shutil.copyfileobj(temp, zipstream)
            zipstream.flush()
示例#13
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from ebook_converter.ebooks.oeb.base import OEB_IMAGES
        from ebook_converter.utils.zipfile import ZipFile
        from lxml import etree

        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
            txt_name = 'index.txt'
            if opts.txt_output_formatting.lower() == 'textile':
                txt_name = 'index.text'
            with TemporaryFile(txt_name) as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, txt_name))

            # Images
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
                    if hasattr(self.writer, 'images'):
                        path = os.path.join(tdir, 'images')
                        if item.href in self.writer.images:
                            href = self.writer.images[item.href]
                        else:
                            continue
                    else:
                        path = os.path.join(tdir, os.path.dirname(item.href))
                        href = os.path.basename(item.href)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    with open(os.path.join(path, href), 'wb') as imgf:
                        imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            txtz = ZipFile(output_path, 'w')
            txtz.add_dir(tdir)
示例#14
0
    def __call__(self, stream, odir, log):
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.metadata.odt import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator

        if not os.path.exists(odir):
            os.makedirs(odir)
        with directory.CurrentDir(odir):
            log.info('Extracting ODT file...')
            stream.seek(0)
            mi = get_metadata(stream, 'odt')
            if not mi.title:
                mi.title = 'Unknown'
            if not mi.authors:
                mi.authors = ['Unknown']
            self.filter_load(stream, mi, log)

            # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
            # which expects, that all lines are strings.
            html = ''.join([str(l) for l in self.lines])

            # A blanket img specification like this causes problems
            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
            # odf2xhtml creates empty title tag
            html = html.replace('<title></title>',
                                '<title>%s</title>' % (mi.title, ))
            try:
                html = self.fix_markup(html, log)
            except:
                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(polyglot.as_bytes(html))
            zf = ZipFile(stream, 'r')
            self.extract_pictures(zf)
            opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
            opf.create_manifest([(os.path.abspath(os.path.join(r, f2)), None)
                                 for r, _, fnames in os.walk(os.getcwd())
                                 for f2 in fnames])
            opf.create_spine([os.path.abspath('index.xhtml')])
            with open('metadata.opf', 'wb') as f:
                opf.render(f)
            return os.path.abspath('metadata.opf')
示例#15
0
def get_metadata(stream, extract_cover=True):
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(
                    tostring(ans[0],
                             method='text',
                             encoding='unicode',
                             with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
示例#16
0
def do_dump(path, dest):
    if os.path.exists(dest):
        shutil.rmtree(dest)
    with ZipFile(path) as zf:
        zf.extractall(dest)
    pretty_all_xml_in_dir(dest)
示例#17
0
class DOCX(object):
    def __init__(self, path_or_stream, log=None, extract=True):
        self.docx_is_transitional = True
        stream = path_or_stream
        if not hasattr(path_or_stream, 'read'):
            stream = open(path_or_stream, 'rb')
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        if extract:
            self.extract(stream)
        else:
            self.init_zipfile(stream)
        self.read_content_types()
        self.read_package_relationships()
        self.namespace = DOCXNamespace(self.docx_is_transitional)

    def init_zipfile(self, stream):
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                               ' more forgiving ZIP parser')
            from ebook_converter.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        return name in self.names

    def read(self, name):
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no '
                              '[Content_Types].xml' % self.name)
        root = etree.fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()='
                               '"Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = \
                    item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()='
                               '"Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return mimetypes.guess_type(name)[0]

    def read_package_relationships(self):
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' %
                              self.name)
        root = etree.fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name'
                               '()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            if target == 'word/document.xml':
                self.docx_is_transitional = (typ != 'http://purl.oclc.org/'
                                             'ooxml/officeDocument/'
                                             'relationships/officeDocument')
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
        if name is None:
            names = tuple(
                n for n in self.names
                if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main '
                                  'document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        return etree.fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = etree.fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*'
                                   '[local-name()="Relationship" and @Type '
                                   'and @Target]'):
                target = item.get('Target')
                if (item.get('TargetMode', None) != 'External'
                        and not target.startswith('#')):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target

        return by_id, by_type

    def get_document_properties_names(self):
        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names
                          if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        yield name
        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names
                          if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        yield name

    @property
    def metadata(self):
        mi = Metadata('Unknown')
        dp_name, ap_name = self.get_document_properties_names()
        if dp_name:
            try:
                raw = self.read(dp_name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi, self.namespace.XPath)
        if mi.is_null('language'):
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi, self.namespace.XPath)

        ap_name = self.relationships.get(self.namespace.names['APPPROPS'],
                                         None)
        if ap_name:
            try:
                raw = self.read(ap_name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
示例#18
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from ebook_converter.ebooks.chardet import detect
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = os.getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for root, _, fnames in os.walk('.'):
                for x in fnames:
                    x = os.path.join(root, x)
                    if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                        with open(x, 'rb') as tf:
                            txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info(
                    'File extension indicates particular formatting. '
                    'Forcing formatting type to: %s', options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s', ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding[
                'confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_',
                    '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                     'euc-cn', 'euccn', 'eucgb2312-cn',
                                     'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of '
                '%s%%', ienc, confidence * 100)
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect '
                'using %s', ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using '
                          'block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s',
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s',
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options,
                                              log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(
                        txt,
                        extensions=[
                            x.strip()
                            for x in options.markdown_extensions.split(',')
                            if x.strip()
                        ])
                except RuntimeError:
                    raise ValueError(
                        'This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                    )
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from ebook_converter.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            odi = options.debug_pipeline
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html',
                                     log, {})
            options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from ebook_converter.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb
示例#19
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.chardet import xml_to_unicode
        from ebook_converter.ebooks.metadata.opf2 import OPF
        from ebook_converter.utils.zipfile import ZipFile

        self.log = log
        html = u''
        top_levels = []

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        for x in os.listdir(u'.'):
            if os.path.isfile(x):
                top_levels.append(x)
        # Try to find an index. file.
        for x in top_levels:
            if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                else:
                    multiple_html = True
        # Warn the user if there multiple HTML file in the archive. HTMLZ
        # supports a single HTML file. A conversion with a multiple HTML file
        # HTMLZ archive probably won't turn out as the user expects. With
        # Multiple HTML files ZIP input should be used in place of HTMLZ.
        if multiple_html:
            log.warn('Multiple HTML files found in the archive. Only %s will '
                     'be used.' % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception('No top level HTML file found.')

        if not html:
            raise Exception('Top level HTML file %s is empty' % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwd()
        htmlfile = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = u'index%d.html' % c
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from ebook_converter.customize.ui import get_file_type_metadata
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=os.getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            cdata = None
            with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(id,
                             href,
                             mimetypes.guess_type(cover_name)[0],
                             data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
示例#20
0
 def init_zipfile(self, stream):
     self.zipf = ZipFile(stream)
     self.names = frozenset(self.zipf.namelist())
示例#21
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from lxml import etree
        from ebook_converter.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from ebook_converter.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.utils.filenames import ascii_filename

        # HTML
        if opts.htmlz_css_type == 'inline':
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from ebook_converter.utils.filenames import shorten_components_to
                fname = shorten_components_to(100, (ascii_filename(str(oeb_book.metadata.title[0])),))[0]
            with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
                if isinstance(html, str):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS
            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    tf.write(htmlizer.get_css(oeb_book))

            # Images
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, u'images')):
                    os.makedirs(os.path.join(tdir, u'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            data = etree.tostring(item.data, encoding='unicode')
                        else:
                            data = item.data
                        fname = os.path.join(tdir, u'images', images[item.href])
                        with open(fname, 'wb') as img:
                            img.write(data)

            # Cover
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from ebook_converter.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    with open(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except:
                import traceback
                traceback.print_exc()

            # Metadata
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            htmlz = ZipFile(output_path, 'w')
            htmlz.add_dir(tdir)
示例#22
0
    def run(self, path_to_output, opts, db, notification=DummyReporter()):
        from ebook_converter.library.catalogs.epub_mobi_builder import CatalogBuilder
        from ebook_converter.utils.logging import default_log as log
        from ebook_converter.utils.config import JSONConfig

        # If preset specified from the cli, insert stored options from JSON file
        if hasattr(opts, 'preset') and opts.preset:
            available_presets = JSONConfig("catalog_presets")
            if opts.preset not in available_presets:
                if available_presets:
                    print('Error: Preset "%s" not found.' % opts.preset)
                    print('Stored presets: %s' %
                          ', '.join([p for p in
                                     sorted(available_presets.keys())]))
                else:
                    print('Error: No stored presets.')
                return 1

            # Copy the relevant preset values to the opts object
            for item in available_presets[opts.preset]:
                if item not in ['exclusion_rules_tw', 'format', 'prefix_rules_tw']:
                    setattr(opts, item, available_presets[opts.preset][item])

            # Provide an unconnected device
            opts.connected_device = {
                         'is_device_connected': False,
                         'kind': None,
                         'name': None,
                         'save_template': None,
                         'serial': None,
                         'storage': None,
                        }

            # Convert prefix_rules and exclusion_rules from JSON lists to tuples
            prs = []
            for rule in opts.prefix_rules:
                prs.append(tuple(rule))
            opts.prefix_rules = tuple(prs)

            ers = []
            for rule in opts.exclusion_rules:
                ers.append(tuple(rule))
            opts.exclusion_rules = tuple(ers)

        opts.log = log
        opts.fmt = self.fmt = path_to_output.rpartition('.')[2]

        # Add local options
        opts.creator = '%s, %s %s, %s' % (date.strftime('%A'), date.strftime('%B'), date.strftime('%d').lstrip('0'), date.strftime('%Y'))
        opts.creator_sort_as = '%s %s' % ('calibre', date.strftime('%Y-%m-%d'))
        opts.connected_kindle = False

        # Finalize output_profile
        op = opts.output_profile
        if op is None:
            op = 'default'

        if opts.connected_device['name'] and 'kindle' in opts.connected_device['name'].lower():
            opts.connected_kindle = True
            if opts.connected_device['serial'] and \
               opts.connected_device['serial'][:4] in ['B004', 'B005']:
                op = "kindle_dx"
            else:
                op = "kindle"

        opts.description_clip = 380 if op.endswith('dx') or 'kindle' not in op else 100
        opts.author_clip = 100 if op.endswith('dx') or 'kindle' not in op else 60
        opts.output_profile = op

        opts.basename = "Catalog"
        opts.cli_environment = not hasattr(opts, 'sync')

        # Hard-wired to always sort descriptions by author, with series after non-series
        opts.sort_descriptions_by_author = True

        build_log = []

        build_log.append("%s('%s'): Generating %s %sin %s environment, locale: '%s'" %
            (self.name,
             current_library_name(),
             self.fmt,
             'for %s ' % opts.output_profile if opts.output_profile else '',
             'CLI' if opts.cli_environment else 'GUI',
             langcode_to_name(canonicalize_lang(get_lang()), localize=False))
             )

        # If exclude_genre is blank, assume user wants all tags as genres
        if opts.exclude_genre.strip() == '':
            # opts.exclude_genre = '\[^.\]'
            # build_log.append(" converting empty exclude_genre to '\[^.\]'")
            opts.exclude_genre = 'a^'
            build_log.append(" converting empty exclude_genre to 'a^'")
        if opts.connected_device['is_device_connected'] and \
           opts.connected_device['kind'] == 'device':
            if opts.connected_device['serial']:
                build_log.append(" connected_device: '%s' #%s%s " %
                    (opts.connected_device['name'],
                     opts.connected_device['serial'][0:4],
                     'x' * (len(opts.connected_device['serial']) - 4)))
                for storage in opts.connected_device['storage']:
                    if storage:
                        build_log.append("  mount point: %s" % storage)
            else:
                build_log.append(" connected_device: '%s'" % opts.connected_device['name'])
                try:
                    for storage in opts.connected_device['storage']:
                        if storage:
                            build_log.append("  mount point: %s" % storage)
                except:
                    build_log.append("  (no mount points)")
        else:
            build_log.append(" connected_device: '%s'" % opts.connected_device['name'])

        opts_dict = vars(opts)
        if opts_dict['ids']:
            build_log.append(" book count: %d" % len(opts_dict['ids']))

        sections_list = []
        if opts.generate_authors:
            sections_list.append('Authors')
        if opts.generate_titles:
            sections_list.append('Titles')
        if opts.generate_series:
            sections_list.append('Series')
        if opts.generate_genres:
            sections_list.append('Genres')
        if opts.generate_recently_added:
            sections_list.append('Recently Added')
        if opts.generate_descriptions:
            sections_list.append('Descriptions')

        if not sections_list:
            if opts.cli_environment:
                opts.log.warn('*** No Section switches specified, enabling all Sections ***')
                opts.generate_authors = True
                opts.generate_titles = True
                opts.generate_series = True
                opts.generate_genres = True
                opts.generate_recently_added = True
                opts.generate_descriptions = True
                sections_list = ['Authors', 'Titles', 'Series', 'Genres', 'Recently Added', 'Descriptions']
            else:
                opts.log.warn('\n*** No enabled Sections, terminating catalog generation ***')
                return ["No Included Sections", "No enabled Sections.\nCheck E-book options tab\n'Included sections'\n"]
        if opts.fmt == 'mobi' and sections_list == ['Descriptions']:
            warning = ("\n*** Adding 'By authors' section required for MOBI "
                       "output ***")
            opts.log.warn(warning)
            sections_list.insert(0, 'Authors')
            opts.generate_authors = True

        opts.log(" Sections: %s" % ', '.join(sections_list))
        opts.section_list = sections_list

        # Limit thumb_width to 1.0" - 2.0"
        try:
            if float(opts.thumb_width) < float(self.THUMB_SMALLEST):
                log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_SMALLEST))
                opts.thumb_width = self.THUMB_SMALLEST
            if float(opts.thumb_width) > float(self.THUMB_LARGEST):
                log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_LARGEST))
                opts.thumb_width = self.THUMB_LARGEST
            opts.thumb_width = "%.2f" % float(opts.thumb_width)
        except:
            log.error("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_SMALLEST))
            opts.thumb_width = "1.0"

        # eval prefix_rules if passed from command line
        if type(opts.prefix_rules) is not tuple:
            try:
                opts.prefix_rules = eval(opts.prefix_rules)
            except:
                log.error("malformed --prefix-rules: %s" % opts.prefix_rules)
                raise
            for rule in opts.prefix_rules:
                if len(rule) != 4:
                    log.error("incorrect number of args for --prefix-rules: %s" % repr(rule))

        # eval exclusion_rules if passed from command line
        if type(opts.exclusion_rules) is not tuple:
            try:
                opts.exclusion_rules = eval(opts.exclusion_rules)
            except:
                log.error("malformed --exclusion-rules: %s" % opts.exclusion_rules)
                raise
            for rule in opts.exclusion_rules:
                if len(rule) != 3:
                    log.error("incorrect number of args for --exclusion-rules: %s" % repr(rule))

        # Display opts
        keys = sorted(opts_dict.keys())
        build_log.append(" opts:")
        for key in keys:
            if key in ['catalog_title', 'author_clip', 'connected_kindle', 'creator',
                       'cross_reference_authors', 'description_clip', 'exclude_book_marker',
                       'exclude_genre', 'exclude_tags', 'exclusion_rules', 'fmt',
                       'genre_source_field', 'header_note_source_field', 'merge_comments_rule',
                       'output_profile', 'prefix_rules', 'preset', 'read_book_marker',
                       'search_text', 'sort_by', 'sort_descriptions_by_author', 'sync',
                       'thumb_width', 'use_existing_cover', 'wishlist_tag']:
                build_log.append("  %s: %s" % (key, repr(opts_dict[key])))
        if opts.verbose:
            log('\n'.join(line for line in build_log))

        # Capture start_time
        opts.start_time = time.time()

        self.opts = opts

        if opts.verbose:
            log.info(" Begin catalog source generation (%s)" %
                     str(datetime.timedelta(seconds=int(time.time() - opts.start_time))))

        # Launch the Catalog builder
        catalog = CatalogBuilder(db, opts, self, report_progress=notification)

        try:
            catalog.build_sources()
            if opts.verbose:
                log.info(" Completed catalog source generation (%s)\n"  %
                         str(datetime.timedelta(seconds=int(time.time() - opts.start_time))))
        except (AuthorSortMismatchException, EmptyCatalogException) as e:
            log.error(" *** Terminated catalog generation: %s ***" % e)
        except:
            log.error(" unhandled exception in catalog generator")
            raise

        else:
            recommendations = []
            recommendations.append(('remove_fake_margins', False,
                OptionRecommendation.HIGH))
            recommendations.append(('comments', '', OptionRecommendation.HIGH))

            """
            >>> Use to debug generated catalog code before pipeline conversion <<<
            """
            GENERATE_DEBUG_EPUB = False
            if GENERATE_DEBUG_EPUB:
                catalog_debug_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'Catalog debug')
                setattr(opts, 'debug_pipeline', os.path.expanduser(catalog_debug_path))

            dp = getattr(opts, 'debug_pipeline', None)
            if dp is not None:
                recommendations.append(('debug_pipeline', dp,
                    OptionRecommendation.HIGH))

            if opts.output_profile and opts.output_profile.startswith("kindle"):
                recommendations.append(('output_profile', opts.output_profile,
                    OptionRecommendation.HIGH))
                recommendations.append(('book_producer', opts.output_profile,
                    OptionRecommendation.HIGH))
                if opts.fmt == 'mobi':
                    recommendations.append(('no_inline_toc', True,
                        OptionRecommendation.HIGH))
                    recommendations.append(('verbose', 2,
                        OptionRecommendation.HIGH))

            # Use existing cover or generate new cover
            cpath = None
            existing_cover = False
            try:
                search_text = 'title:"%s" author:%s' % (
                        opts.catalog_title.replace('"', '\\"'), 'calibre')
                matches = db.search(search_text, return_matches=True, sort_results=False)
                if matches:
                    cpath = db.cover(matches[0], index_is_id=True, as_path=True)
                    if cpath and os.path.exists(cpath):
                        existing_cover = True
            except:
                pass

            if self.opts.use_existing_cover and not existing_cover:
                log.warning("no existing catalog cover found")

            if self.opts.use_existing_cover and existing_cover:
                recommendations.append(('cover', cpath, OptionRecommendation.HIGH))
                log.info("using existing catalog cover")
            else:
                # TODO(gryf): feature: generating cover with pillow.
                pass
                # from ebook_converter.ebooks.covers import calibre_cover2
                # log.info("replacing catalog cover")
                # new_cover_path = PersistentTemporaryFile(suffix='.jpg')
                # # new_cover = calibre_cover2(opts.catalog_title, 'calibre')
                # new_cover_path.write('')
                # new_cover_path.close()
                # recommendations.append(('cover', new_cover_path.name, OptionRecommendation.HIGH))

            # Run ebook-convert
            from ebook_converter.ebooks.conversion.plumber import Plumber
            plumber = Plumber(os.path.join(catalog.catalog_path, opts.basename + '.opf'),
                            path_to_output, log, report_progress=notification,
                            abort_after_input_dump=False)
            plumber.merge_ui_recommendations(recommendations)
            plumber.run()

            try:
                os.remove(cpath)
            except:
                pass

            if GENERATE_DEBUG_EPUB:
                from ebook_converter.ebooks.epub import initialize_container
                from ebook_converter.ebooks.tweak import zip_rebuilder
                from ebook_converter.utils.zipfile import ZipFile
                input_path = os.path.join(catalog_debug_path, 'input')
                epub_shell = os.path.join(catalog_debug_path, 'epub_shell.zip')
                initialize_container(epub_shell, opf_name='content.opf')
                with ZipFile(epub_shell, 'r') as zf:
                    zf.extractall(path=input_path)
                os.remove(epub_shell)
                zip_rebuilder(input_path, os.path.join(catalog_debug_path, 'input.epub'))

            if opts.verbose:
                log.info(" Catalog creation complete (%s)\n" %
                     str(datetime.timedelta(seconds=int(time.time() - opts.start_time))))

        # returns to gui2.actions.catalog:catalog_generated()
        return catalog.error
示例#23
0
    def convert(self, oeb, output_path, input_plugin, opts, log):
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from ebook_converter.ebooks.mobi.writer8.toc import TOCAdder
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from ebook_converter.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from ebook_converter.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from ebook_converter.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

        from ebook_converter.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                max_flow_size=self.opts.flow_size*1024
                )
        split(self.oeb, self.opts)

        from ebook_converter.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
                no_default_cover=self.opts.no_default_epub_cover,
                no_svg_cover=self.opts.no_svg_cover,
                preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                    'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add('Start', first.href)

        identifiers = oeb.metadata['identifier']
        _uuid = None
        for x in identifiers:
            if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid' or
                    str(x).startswith('urn:uuid:')):
                _uuid = str(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if _uuid is None:
            self.log.warn('No UUID identifier found')
            _uuid = str(uuid.uuid4())
            oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)

        if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            for x in identifiers:
                if str(x) == _uuid:
                    x.content = 'urn:uuid:' + _uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from ebook_converter.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
            if self.is_periodical:
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from ebook_converter.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                    if x.endswith('.ncx')][0])
            if self.opts.epub_version == '3':
                self.upgrade_to_epub3(tdir, opf)
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)

            from ebook_converter.ebooks.epub import initialize_container
            with initialize_container(output_path, os.path.basename(opf),
                    extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                            metadata_xml.encode('utf-8'))
            if opts.extract_to is not None:
                from ebook_converter.utils.zipfile import ZipFile
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)
示例#24
0
    def convert(self, recipe_or_file, opts, file_ext, log,
            accelerators):
        from ebook_converter.web.feeds.recipes import compile_recipe
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            from ebook_converter.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            zf.extractall()
            zf.close()
            with open('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                from ebook_converter.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                with open(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                from ebook_converter.web.feeds.recipes.collection import (
                        get_builtin_recipe_by_title, get_builtin_recipe_titles)
                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                    title = title.rpartition('.')[0]

                raw = get_builtin_recipe_by_title(title, log=log,
                        download_recipe=not opts.dont_download_recipe)
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        log.warn(
                        'Downloaded recipe needs calibre version at least: %s' %
                        ('.'.join(recipe.requires_version)))
                        builtin = True
                except:
                    log.exception('Failed to compile downloaded recipe. Falling '
                            'back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(title, log=log,
                            download_recipe=False)
                    if raw is None:
                        raise ValueError('Failed to find builtin recipe: '+title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')

            if recipe is None:
                raise ValueError('%r is not a valid recipe file or builtin recipe' %
                        recipe_or_file)

            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro

        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)
示例#25
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks import DRMError

        _path_or_stream = getattr(stream, 'name', 'stream')
        try:
            zf = ZipFile(stream)
            zf.extractall(os.getcwd())
        except Exception:
            log.exception('EPUB appears to be invalid ZIP file, trying a '
                          'more forgiving ZIP parser')
            from ebook_converter.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = self.find_opf()
        if opf is None:
            for root, _, fnames in os.walk('.'):
                for f in fnames:
                    f = os.path.join(root, f)
                    if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                            not os.path.basename(f).startswith('.'):
                        opf = os.path.abspath(f)
                        break

        if opf is None:
            raise ValueError('%s is not a valid EPUB file (could not find '
                             'opf)' % _path_or_stream)

        opf = os.path.relpath(opf, os.getcwd())
        parts = os.path.split(opf)
        opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(_path_or_stream))
        self.encrypted_fonts = self._encrypted_font_uris

        # NOTE(gryf): check if opf is nested on the directory(ies), if so,
        # update the links for guide and manifest.
        if len(parts) > 1 and parts[0]:
            path = os.path.join(parts[0])

            for elem in opf.itermanifest():
                elem.set('href', os.path.join(path, elem.get('href')))
            for elem in opf.iterguide():
                elem.set('href', os.path.join(path, elem.get('href')))

        if opf.package_version >= 3.0:
            f = self.rationalize_cover3
        else:
            f = self.rationalize_cover2
        self.removed_cover = f(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover, )
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if len(list(opf.iterspine())) == 0:
            raise ValueError('No valid entries in the spine of this EPUB')

        with open('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')