예제 #1
0
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """Find the cover image of an ODF document and record it on ``mi``.

    Every ``draw:frame`` containing a ``draw:image`` is inspected in
    document order. When ``opfmeta`` is true, a frame named ``opf.cover``
    (case-insensitive) wins. Otherwise, or as a fallback, the first
    readable image is used if its aspect ratio and area look like a cover
    (heuristic borrowed from the docx reader).

    :param stream: stream passed to ``odLoad`` to parse the document.
    :param zin: archive object; ``zin.read(href)`` returns the image bytes
        and raises ``KeyError`` for a missing member.
    :param mi: metadata object; ``cover``, ``odf_cover_frame`` and
        (optionally) ``cover_data`` are set on it when a cover is found.
    :param opfmeta: whether to honour the ``opf.cover`` frame name.
    :param extract_cover: whether to also store the image bytes in
        ``mi.cover_data`` as ``(fmt, raw)``.
    """
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if not img:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            continue
        imgnum += 1
        if opfmeta:
            # The frame name is optional, so getAttribute() may return None;
            # an unnamed frame simply cannot be the explicit cover.
            frame_name = frm.getAttribute('name')
            if frame_name and frame_name.lower() == 'opf.cover':
                cover_href = i_href
                cover_data = (fmt, raw)
                cover_frame = frame_name  # could have upper case
                break
        # ``width > 0`` protects the ratio below from a zero-width image;
        # non-positive dimensions can never satisfy the size test anyway.
        if (cover_href is None and imgnum == 1 and width > 0 and
                0.8 <= height / width <= 1.8 and height * width >= 12000):
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            if not cover_data:
                # Defensive fallback: re-read the image if its data was not
                # captured while scanning the frames.
                raw = zin.read(cover_href)
                try:
                    fmt = identify(raw)[0]
                except Exception:
                    pass
                else:
                    cover_data = (fmt, raw)
            mi.cover_data = cover_data
예제 #2
0
 def inspect_cover(self, href):
     """Return ``(width, height)`` of the manifest image matching ``href``.

     Each manifest item whose ``href`` equals the normalized ``href`` is
     tried in turn; a failure to read or identify one is logged and the
     search continues. ``(-1, -1)`` is returned when no item yields
     dimensions.
     """
     from ebook_converter.ebooks.oeb.base import urlnormalize
     for item in self.oeb.manifest:
         if item.href != urlnormalize(href):
             continue
         try:
             # identify() yields (fmt, width, height); drop the format.
             dimensions = identify(item.data)[1:]
         except Exception:
             self.log.exception('Failed to read cover image dimensions')
         else:
             return dimensions
     return -1, -1
예제 #3
0
 def read_image(self, href):
     """Return the ``Image`` record for ``href``, creating it on first use.

     Results are cached in ``self.images``. ``None`` is returned when the
     manifest has no item for ``href`` or the item's data is not bytes.
     Image data that cannot be identified is replaced by the bundled
     blank PNG before the record is built.
     """
     if href in self.images:
         return self.images[href]

     item = self.oeb.manifest.hrefs.get(href)
     if item is None or not isinstance(item.data, bytes):
         return None

     try:
         fmt, width, height = identify(item.data)
     except Exception:
         self.log.warning('Replacing corrupted image with blank: %s' %
                          href)
         item.data = I('blank.png',
                       data=True,
                       allow_user_override=False)
         fmt, width, height = identify(item.data)

     target_name = 'media/' + self.create_filename(href, fmt)
     rel_id = self.document_relationships.add_image(target_name)
     self.images[href] = Image(rel_id, target_name, width, height,
                               fmt, item)
     # The record keeps a reference to the item, so its raw data can be
     # released from memory now.
     item.unload_data_from_memory()
     return self.images[href]
예제 #4
0
def _parse_cover_data(root, imgid, mi, ctx):
    """Decode the FB2 ``<binary>`` element ``imgid`` into ``mi.cover_data``.

    Nothing happens when no such element exists or it has no text. The
    declared ``content-type`` (default ``image/jpeg``) must map to a known
    file extension; if it does not but still claims to be an image, the
    type guessed from ``imgid`` itself is tried. An unsupported type only
    prints a warning.
    """
    from ebook_converter.ebooks.fb2 import base64_decode
    matches = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root)
    if not matches:
        return

    binary = matches[0]
    mimetype = binary.get('content-type', 'image/jpeg')
    extensions = mimetypes.guess_all_extensions(mimetype)

    if not extensions and mimetype.startswith('image/'):
        # Unknown image subtype: fall back to what the id looks like.
        guessed = mimetypes.guess_type(imgid)[0]
        if guessed and guessed.startswith('image/'):
            extensions = mimetypes.guess_all_extensions(guessed)

    if not extensions:
        print(f"WARNING: Unsupported coverpage mime-type '{mimetype}' "
              f"(id=#{imgid})")
        return

    encoded = binary.text
    if encoded:
        cdata = base64_decode(encoded.strip())
        mi.cover_data = (identify(cdata)[0], cdata)
예제 #5
0
    def mobimlize_elem(self, elem, stylizer, bstate, istates,
            ignore_valign=False):
        """Convert one XHTML element, and recursively its children, to MOBI markup.

        Elements whose tag is not a string or lies outside the XHTML
        namespace are ignored. For every handled element a copy of the
        innermost formatting state (``istates[-1]``) is pushed onto
        ``istates``, filled in from the element's computed style, used while
        the element's text, children and child tails are emitted through
        ``mobimlize_content``, and popped again before returning.

        :param elem: element to process. It may be modified in place: hidden
            elements that carry an id are reduced to an empty anchor, and
            non-breaking spaces or quotation marks may be inserted into its
            text/tail.
        :param stylizer: object whose ``style(elem)`` returns the computed
            style of an element.
        :param bstate: block-level state shared across the traversal (current
            paragraph, pending vertical margin/padding, page-break flag, ...).
        :param istates: list used as a stack of inline formatting states.
        :param ignore_valign: when true, super/subscript wrapping is skipped;
            set by the recursive call that renders vertically aligned content.
        """
        if not isinstance(elem.tag, (str, bytes)) \
           or parse_utils.namespace(elem.tag) != const.XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        # Hidden elements are dropped, unless they are explicitly marked as
        # searchable jacket tags.
        if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
                elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = base.tag('xhtml', 'a')
            else:
                return
        tag = parse_utils.barename(elem.tag)
        # Start from the parent's inline state and push a fresh copy for this
        # element; it is popped at the very end of the method.
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start'])-1
            except:
                pass
        istates.append(istate)
        left = 0
        # Collapse the table display values into plain inline/block for the
        # block test below.
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        # A block is anything non-inline, displayed, not floated and not <br>.
        isblock = (not display.startswith('inline') and style['display'] !=
                'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            # Starting a block ends the current paragraph.
            bstate.para = None
            istate.halign = style['text-align']
            rawti = style._get('text-indent')
            istate.indent = style['text-indent']
            if hasattr(rawti, 'strip') and '%' in rawti:
                # We have a percentage text indent, these can come out looking
                # too large if the user chooses a wide output profile like
                # tablet
                istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            # <body> margins/padding do not contribute to the left offset.
            if tag != 'body':
                left = margin + padding
            istate.left += left
            # Vertical margins take the maximum of the pending and the new
            # value; padding instead accumulates and absorbs the pending
            # margin.
            vmargin = asfloat(style['margin-top'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            # Inline element outside a link: emulate horizontal margin and
            # padding with non-breaking spaces, scaled by the font size.
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = ('\xa0' * spaces) + (elem.text or '')
            margin = asfloat(style['margin-right'])
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + ('\xa0' * spaces)
                else:
                    last = elem[-1]
                    last.text = (last.text or '') + ('\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        # Copy the character-level formatting from the computed style.
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = style['white-space'] == 'pre'
        istate.pre_wrap = style['white-space'] == 'pre-wrap'
        istate.bgcolor  = style['background-color']
        istate.fgcolor  = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        # Reduce the font family to one of monospace / sans-serif / serif by
        # substring matching on the lower-cased family string.
        ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
                'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        # Both id and name attributes are recorded as ids on the state.
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    # A dimension equal to the profile's becomes 100%.
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(round(float(value) /
                                (72/self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                # At least one dimension is still unknown: read the real image
                # size from the manifest item.
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:',
                            href)
                else:
                    try:
                        width, height = identify(item.data)[1:]
                    except Exception:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            # Only one dimension is known: derive the other
                            # from the image's aspect ratio.
                            # NOTE(review): a zero height reported by
                            # identify() would raise ZeroDivisionError here,
                            # outside any of the surrounding handlers.
                            ar = width / height
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height'])*ar
                                except:
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width'])/ar
                                except:
                                    pass
                                istate.attrib['height'] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
            # Rules get a percentage width: either the raw CSS percentage or
            # the computed width relative to the profile width.
            raww = style._get('width')
            if hasattr(raww, 'strip') and '%' in raww:
                istate.attrib['width'] = raww
            else:
                prop = style['width'] / self.profile.width
                istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        # NOTE(review): ``display`` was already remapped from 'table-cell' to
        # 'inline' above, so this branch looks unreachable.
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            # Translate CSS background/border into HTML attributes on the
            # element; they are copied to the state just below.
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                    'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            # Wrap the quotation in curly double quotes: opening before the
            # text, closing at the start of the tail.
            t = elem.text
            if not t:
                t = ''
            elem.text = '\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = '\u201d' + t
        text = None
        if elem.text:
            if istate.preserve or istate.pre_wrap:
                text = elem.text
            elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                  elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
                # Whitespace-only text before a first child that is not an
                # inline tag is dropped.
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top',
                'text-bottom', 'top', 'bottom') or (
                isinstance(valign, numbers.Number) and abs(valign) != 0)
        issup = valign in ('super', 'text-top', 'top') or (
            isinstance(valign, numbers.Number) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            # Render the element into a scratch document (with valign
            # handling disabled), then graft the result into the real output
            # inside <sup>/<sub><small>.
            nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, base.tag('xhtml', 'body')))
            vbstate.para = etree.SubElement(vbstate.body, base.tag('xhtml', 'p'))
            self.mobimlize_elem(elem, stylizer, vbstate, istates,
                    ignore_valign=True)
            # Discard the state pushed for this element at the top of the
            # method (the recursive call pops its own), making sure the
            # stack never ends up empty.
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
                vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
                return

        if tag == 'blockquote':
            # Margins are always honoured inside a blockquote; the option is
            # restored after the children have been processed.
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
                len(elem)==0)):
            if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
                # An explicit <li value="N"> resets the counter kept on the
                # enclosing state.
                try:
                    value = int(elem.attrib['value'])
                    istates[-2].list_num = value - 1
                except:
                    pass
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            # The child's tail text belongs to this element, so it is
            # emitted with this element's tag.
            tail = None
            if child.tail:
                if istate.preserve or istate.pre_wrap:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            # A paragraph holding only a single non-breaking space is either
            # turned into a <br> or removed, depending on the block height.
            if para is not None and para.text == '\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(base.tag('xhtml', 'br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            # Bottom margin/padding are accumulated the same way as the top
            # ones at the start of the block.
            vmargin = asfloat(style['margin-bottom'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()
예제 #6
0
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    """Tidy, in place, the HTML tree produced from a DOCX document.

    The passes run in this order: apply vertical alignment wrappers, move
    trailing ``<hr>`` elements out of their paragraph, merge consecutive
    spans with the same styling, normalise ``dir`` attributes, lift a lone
    span into its parent block, turn bold/italic-only spans into
    ``<b>``/``<i>``, remove unstyled spans, rewrite ``<br>`` page breaks and
    finally, if requested, detect a cover image.

    :param log: logger used to report a detected cover.
    :param root: root element of the tree; modified in place.
    :param styles: object whose ``classes.values()`` yields
        ``(class name, css dict)`` pairs.
    :param dest_dir: directory that image ``src`` values are relative to.
    :param detect_cover: whether to check if the first image is a cover.
    :param XPath: factory turning an XPath expression into a callable.
    :return: the path of the detected cover image (the ``<img>`` is removed
        from the tree), otherwise ``None``.
    """
    # Apply vertical-align
    for span in root.xpath('//span[@data-docx-vert]'):
        wrap_contents(span.attrib.pop('data-docx-vert'), span)

    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            # Only an <hr> that is the very last node of the block can be
            # moved after it without reordering content.
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]
    # Flush the final run: without this a mergeable run that ends at the
    # last <span> of the document would never be merged.
    if len(current_run) > 1:
        merge_run(current_run)

    # Process dir attributes
    class_map = dict(styles.classes.values())
    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
    for parent in root.xpath('//*[(%s)]' % ' or '.join('name()="%s"' % t for t in parents)):
        # Ensure that children of rtl parents that are not rtl have an
        # explicit dir set. Also, remove dir from children if it is the same as
        # that of the parent.
        if len(parent):
            parent_dir = parent.get('dir')
            for child in parent.iterchildren('span'):
                child_dir = child.get('dir')
                if parent_dir == 'rtl' and child_dir != 'rtl':
                    child_dir = 'ltr'
                    child.set('dir', child_dir)
                if child_dir and child_dir == parent_dir:
                    child.attrib.pop('dir')

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            span_dir = span.get('dir')
            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                if span.get('dir'):
                    parent.set('dir', span.get('dir'))
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            if not p.text:
                p.text = NBSP

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from ebook_converter.utils.imghdr import identify
                try:
                    with open(path, 'rb') as imf:
                        width, height = identify(imf)[1:]
                except Exception:
                    # Unreadable image: zero dimensions fail the test below.
                    width, height = 0, 0
                try:
                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path
    return None
예제 #7
0
    def extract_content(self, output_dir):
        """Write the book's text and images to ``output_dir`` and convert them.

        Every text record becomes ``<uid>.html``, every image record
        ``images/<uid>.jpg``, and composite images are assembled from the
        already written single images. The home record is then fed to the
        HTML input plugin.

        :param output_dir: directory that receives the extracted files.
        :return: the OEB book produced by the HTML input plugin.
        :raises Exception: if the home.html record cannot be determined.
        """
        # Each text record is independent (unless the continuation
        # value is set in the previous record). Put each converted
        # text record into a separate file. We will reference the
        # home.html file as the first file and let the HTML input
        # plugin assemble the order based on hyperlinks.
        with CurrentDir(output_dir):
            for uid, num in self.uid_text_secion_number.items():
                self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
                with open('%s.html' % uid, 'wb') as htmlf:
                    html = u'<html><body>'
                    section_header, section_data = self.sections[num]
                    # NOTE(review): only the compressed branch decodes the
                    # result of process_phtml(); confirm what that method
                    # returns, since the two branches look inconsistent.
                    if section_header.type == DATATYPE_PHTML:
                        html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                    elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                        d = self.decompress_phtml(section_data.data)
                        html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
                    html += '</body></html>'
                    htmlf.write(html.encode('utf-8'))

        # Images.
        # Remember which images were written, composite images need them.
        images = set()
        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)
        with CurrentDir(images_dir):
            # Single images.
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if section_data:
                    idata = None
                    if section_header.type == DATATYPE_TBMP:
                        idata = section_data
                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                        if self.header_record.compression == 1:
                            idata = decompress_doc(section_data)
                        elif self.header_record.compression == 2:
                            idata = zlib.decompress(section_data)
                    try:
                        save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                        images.add(uid)
                        self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                    except Exception as e:
                        self.log.error('Failed to write image with uid %s: %s' % (uid, e))
                else:
                    self.log.error('Failed to write image with uid %s: No data.' % uid)
            # Composite images.
            # We're going to use the already compressed .jpg images here.
            for uid, num in self.uid_composite_image_section_number.items():
                try:
                    section_header, section_data = self.sections[num]
                    # Get the final width and height.
                    width = 0
                    height = 0
                    for row in section_data.layout:
                        row_width = 0
                        col_height = 0
                        for col in row:
                            if col not in images:
                                raise Exception('Image with uid: %s missing.' % col)
                            # Close the handle promptly instead of leaking it.
                            with open('%s.jpg' % col, 'rb') as imgf:
                                w, h = identify(imgf)[1:]
                            row_width += w
                            if col_height < h:
                                col_height = h
                        if width < row_width:
                            width = row_width
                        height += col_height
                    # Create a new image the total size of all image
                    # parts. Put the parts into the new image.
                    with Canvas(width, height) as canvas:
                        y_off = 0
                        for row in section_data.layout:
                            x_off = 0
                            largest_height = 0
                            for col in row:
                                with open('%s.jpg' % col, 'rb') as imgf:
                                    im = image_from_data(imgf.read())
                                canvas.compose(im, x_off, y_off)
                                w, h = im.width(), im.height()
                                x_off += w
                                if largest_height < h:
                                    largest_height = h
                            y_off += largest_height
                    # The file must be opened for binary writing; the default
                    # read mode made every composite write fail.
                    with open('%s.jpg' % uid, 'wb') as out:
                        out.write(canvas.export(compression_quality=70))
                    self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

        # Run the HTML through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(self.options, opt.option.name, opt.recommended_value)
        self.options.input_encoding = 'utf-8'
        odi = self.options.debug_pipeline
        self.options.debug_pipeline = None
        try:
            # Determine the home.html record uid. This should be set in the
            # reserved values in the metadata record. home.html is the first
            # text record (should have hyper link references to other records)
            # in the document.
            try:
                home_html = self.header_record.home_html
                if not home_html:
                    # dict views are not subscriptable in Python 3; take the
                    # first key through an iterator instead.
                    home_html = next(iter(self.uid_text_secion_number))
            except Exception as err:
                raise Exception('Could not determine home.html') from err
            # Generate oeb from html conversion.
            with open('%s.html' % home_html, 'rb') as home_stream:
                oeb = html_input.convert(home_stream, self.options, 'html', self.log, {})
        finally:
            # Restore the caller's setting even when the conversion fails.
            self.options.debug_pipeline = odi

        return oeb