Exemplo n.º 1
0
    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
            self.page_break_selectors = set()
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
            for rule in rules(stylesheets):
                before = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower())
                after  = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-after'), 'cssText', '').strip().lower())
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((rule.selectorText, True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((rule.selectorText, False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except:
                    pass
        page_breaks = set()
        select = Select(item.data)
        if not self.page_break_selectors:
            return [], []
        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
            return [], []
        descendants = frozenset(body[0].iterdescendants('*'))

        for selector, before in self.page_break_selectors:
            try:
                for elem in select(selector):
                    if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
            except SelectorError as err:
                self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))

        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Cant set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x:int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d'%i))
            id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]'%id)
            except:
                try:
                    xp = XPath("//*[@id='%s']"%id)
                except:
                    # The id has both a quote and an apostrophe or some other
                    # Just replace it since I doubt its going to work anywhere else
                    # either
                    id = 'calibre_pb_%d'%i
                    x.set('id', id)
                    xp = XPath('//*[@id=%r]'%id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(id)

        for elem in item.data.iter(etree.Element):
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids
Exemplo n.º 2
0
    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
            from calibre.ebooks.oeb.stylizer import fix_namespace
            css_to_xpath = HTMLTranslator().css_to_xpath
            self.page_break_selectors = set([])
            stylesheets = [
                x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES
            ]
            for rule in rules(stylesheets):
                before = getattr(
                    rule.style.getPropertyCSSValue('page-break-before'),
                    'cssText', '').strip().lower()
                after = getattr(
                    rule.style.getPropertyCSSValue('page-break-after'),
                    'cssText', '').strip().lower()
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((XPath(
                            fix_namespace(css_to_xpath(rule.selectorText))),
                                                       True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((XPath(
                            fix_namespace(css_to_xpath(rule.selectorText))),
                                                       False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except:
                    pass
        page_breaks = set([])
        for selector, before in self.page_break_selectors:
            body = item.data.xpath('//h:body', namespaces=NAMESPACES)
            if not body:
                continue
            for elem in selector(body[0]):
                if elem not in body:
                    elem.set('pb_before', '1' if before else '0')
                    page_breaks.add(elem)

        for i, elem in enumerate(item.data.iter()):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Cant set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x: int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d' % i))
            id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]' % id)
            except:
                try:
                    xp = XPath("//*[@id='%s']" % id)
                except:
                    # The id has both a quote and an apostrophe or some other
                    # Just replace it since I doubt its going to work anywhere else
                    # either
                    id = 'calibre_pb_%d' % i
                    x.set('id', id)
                    xp = XPath('//*[@id=%r]' % id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(id)

        for elem in item.data.iter():
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids
Exemplo n.º 3
0
    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
            self.page_break_selectors = set()
            stylesheets = [
                x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES
            ]
            for rule in rules(stylesheets):
                before = getattr(
                    rule.style.getPropertyCSSValue('page-break-before'),
                    'cssText', '').strip().lower()
                after = getattr(
                    rule.style.getPropertyCSSValue('page-break-after'),
                    'cssText', '').strip().lower()
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add(
                            (rule.selectorText, True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add(
                            (rule.selectorText, False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except:
                    pass
        page_breaks = set()
        select = Select(item.data)
        if not self.page_break_selectors:
            return [], []
        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
            return [], []
        descendants = frozenset(body[0].iterdescendants('*'))

        for selector, before in self.page_break_selectors:
            try:
                for elem in select(selector):
                    if elem in descendants and elem.tag.rpartition(
                            '}')[2].lower() not in {
                                'html', 'body', 'head', 'style', 'script',
                                'meta', 'link'
                            }:
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
            except SelectorError as err:
                self.log.warn(
                    'Ignoring page breaks specified with invalid CSS selector: %r (%s)'
                    % (selector, as_unicode(err)))

        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Cant set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x: int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d' % i))
            id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]' % id)
            except:
                try:
                    xp = XPath("//*[@id='%s']" % id)
                except:
                    # The id has both a quote and an apostrophe or some other
                    # Just replace it since I doubt its going to work anywhere else
                    # either
                    id = 'calibre_pb_%d' % i
                    x.set('id', id)
                    xp = XPath('//*[@id=%r]' % id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(id)

        for elem in item.data.iter(etree.Element):
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids
Exemplo n.º 4
0
    def find_page_breaks(self, item):
        if self.page_break_selectors is None:
            from calibre.ebooks.oeb.stylizer import fix_namespace
            css_to_xpath = HTMLTranslator().css_to_xpath
            self.page_break_selectors = set([])
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
            for rule in rules(stylesheets):
                before = getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower()
                after  = getattr(rule.style.getPropertyCSSValue(
                    'page-break-after'), 'cssText', '').strip().lower()
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
                            True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((XPath(fix_namespace(css_to_xpath(rule.selectorText))),
                            False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except:
                    pass
        page_breaks = set([])
        for selector, before in self.page_break_selectors:
            body = item.data.xpath('//h:body', namespaces=NAMESPACES)
            if not body:
                continue
            for elem in selector(body[0]):
                if elem not in body:
                    elem.set('pb_before', '1' if before else '0')
                    page_breaks.add(elem)

        for i, elem in enumerate(item.data.iter()):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Cant set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x:int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d'%i))
            id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]'%id)
            except:
                try:
                    xp = XPath("//*[@id='%s']"%id)
                except:
                    # The id has both a quote and an apostrophe or some other
                    # Just replace it since I doubt its going to work anywhere else
                    # either
                    id = 'calibre_pb_%d'%i
                    x.set('id', id)
                    xp = XPath('//*[@id=%r]'%id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(id)

        for elem in item.data.iter():
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids