Пример #1
0
    def extract_css(self, root, log):
        ans = []
        for s in root.xpath(
                '//*[local-name() = "style" and @type="text/css"]'):
            ans.append(s.text)
            s.getparent().remove(s)

        head = root.xpath('//*[local-name() = "head"]')
        if head:
            head = head[0]
            ns = head.nsmap.get(None, '')
            if ns:
                ns = '{%s}' % ns
            etree.SubElement(head, ns + 'link', {
                'type': 'text/css',
                'rel': 'stylesheet',
                'href': 'odfpy.css'
            })

        css = u'\n\n'.join(ans)
        parser = CSSParser(loglevel=logging.WARNING, log=_css_logger)
        self.css = parser.parseString(css, validate=False)

        with open('odfpy.css', 'wb') as f:
            f.write(css.encode('utf-8'))
Пример #2
0
def parse_css(data,
              fname='<string>',
              is_declaration=False,
              decode=None,
              log_level=None,
              css_preprocessor=None):
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from css_parser import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False
    data = data or ''
    if isinstance(data, bytes):
        data = data.decode('utf-8') if decode is None else decode(data)
    if css_preprocessor is not None:
        data = css_preprocessor(data)
    parser = CSSParser(
        loglevel=log_level,
        # We dont care about @import rules
        fetcher=lambda x: (None, None),
        log=_css_logger)
    if is_declaration:
        data = parser.parseStyle(data, validate=False)
    else:
        data = parser.parseString(data, href=fname, validate=False)
    return data
Пример #3
0
def beautify_text(raw, syntax):
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'xml':
        root = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    elif syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_css_parser_serialization
        from css_parser import CSSParser, log
        setup_css_parser_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser = CSSParser(loglevel=logging.WARNING,
                           # We dont care about @import rules
                           fetcher=lambda x: (None, None), log=_css_logger)
        data = parser.parseString(raw, href='<string>', validate=False)
        return serialize(data, 'text/css')
    else:
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    return etree.tostring(root, encoding='unicode')
Пример #4
0
def beautify_text(raw, syntax):
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'xml':
        root = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    elif syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_css_parser_serialization
        from css_parser import CSSParser, log
        setup_css_parser_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser = CSSParser(loglevel=logging.WARNING,
                           # We dont care about @import rules
                           fetcher=lambda x: (None, None), log=_css_logger)
        data = parser.parseString(raw, href='<string>', validate=False)
        return serialize(data, 'text/css')
    else:
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    return etree.tostring(root, encoding=unicode)
Пример #5
0
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from css_parser import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False
    data = data or ''
    if isinstance(data, bytes):
        data = data.decode('utf-8') if decode is None else decode(data)
    if css_preprocessor is not None:
        data = css_preprocessor(data)
    parser = CSSParser(loglevel=log_level,
                        # We dont care about @import rules
                        fetcher=lambda x: (None, None), log=_css_logger)
    if is_declaration:
        data = parser.parseStyle(data, validate=False)
    else:
        data = parser.parseString(data, href=fname, validate=False)
    return data
Пример #6
0
    def extract_css(self, root, log):
        ans = []
        for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
            ans.append(s.text)
            s.getparent().remove(s)

        head = root.xpath('//*[local-name() = "head"]')
        if head:
            head = head[0]
            ns = head.nsmap.get(None, '')
            if ns:
                ns = '{%s}'%ns
            etree.SubElement(head, ns+'link', {'type':'text/css',
                'rel':'stylesheet', 'href':'odfpy.css'})

        css = u'\n\n'.join(ans)
        parser = CSSParser(loglevel=logging.WARNING,
                            log=_css_logger)
        self.css = parser.parseString(css, validate=False)

        with open('odfpy.css', 'wb') as f:
            f.write(css.encode('utf-8'))
Пример #7
0
    def __init__(self,
                 tree,
                 path,
                 oeb,
                 opts,
                 profile=None,
                 extra_css='',
                 user_css='',
                 base_css=''):
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = xpath(tree,
                           '//*[local-name()="style" or local-name()="link"]')

        # Add css_parser parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'], profile['props'],
                                   profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        for elem in style_tags:
            if (elem.tag == XHTML('style')
                    and elem.get('type', CSS_MIME) in OEB_STYLES
                    and media_ok(elem.get('media'))):
                text = elem.text if elem.text else ''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if not media_ok(rule.media.mediaText):
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn(
                                    'Ignoring missing stylesheet in @import rule:',
                                    rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn(
                                    'CSS @import of non-CSS file %r' %
                                    rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet,
                                item.abshref,
                                ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == XHTML('link') and elem.get('href')
                  and elem.get('rel', 'stylesheet').lower() == 'stylesheet'
                  and elem.get('type', CSS_MIME).lower() in OEB_STYLES
                  and media_ok(elem.get('media'))):
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r is not CSS' %
                        (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css': extra_css, 'user_css': user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheets.append(stylesheet)
                except Exception:
                    self.logger.exception('Failed to parse %s, ignoring.' % w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)

        # using oeb to store the rules, page rule and font face rules
        # and generating them again if opts, profile or stylesheets are different
        if (not hasattr(self.oeb, 'stylizer_rules')) \
            or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
            self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile,
                                                    stylesheets)
        self.rules = self.oeb.stylizer_rules.rules
        self.page_rule = self.oeb.stylizer_rules.page_rule
        self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
        self.flatten_style = self.oeb.stylizer_rules.flatten_style

        self._styles = {}
        pseudo_pat = re.compile(
            ':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in self.rules:
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error(
                    f'Ignoring CSS rule with invalid selector: {text!r} ({as_unicode(err)})'
                )
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(
                        self.oeb, 'plumber_output_format',
                        '').lower() in {'mobi', 'docx'}:
                    # Fake first-letter
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                punctuation_chars = []
                                text = str(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = ''.join(punctuation_chars) + \
                                        (text[0] if text else '')
                                span = x.makeelement('{%s}span' % XHTML_NS)
                                span.text = special_text
                                span.set('data-fake-first-letter', '1')
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Пример #8
0
def safe_parser():
    global _safe_parser
    if _safe_parser is None:
        import logging
        _safe_parser = CSSParser(loglevel=logging.CRITICAL, validate=False)
    return _safe_parser
Пример #9
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css='', base_css=''):
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add css_parser parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                        profile['props'],
                                        profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in style_tags:
            if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if not media_ok(rule.media.mediaText):
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == XHTML('link') and elem.get('href') and elem.get(
                    'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
                    'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media'))
                ):
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    if media_ok(rule.media.mediaText):
                        for subrule in rule.cssRules:
                            rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                            index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                    index = index + 1
        rules.sort(key=itemgetter(0))  # sort by specificity
        self.rules = rules
        self._styles = {}
        pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in rules:
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                    # Fake first-letter
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                punctuation_chars = []
                                text = unicode_type(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = x.makeelement('{%s}span' % XHTML_NS)
                                span.text = special_text
                                span.set('data-fake-first-letter', '1')
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)