def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''
    Search Project Gutenberg and yield up to ``max_results`` SearchResult
    objects, each populated with title, author, cover and the direct
    download links found on the book's details page.

    :param query: Search terms (URL-quoted before use).
    :param max_results: Maximum number of results to yield.
    :param timeout: Accepted for interface compatibility; the browser calls
        here do not currently pass it through.
    :param write_raw_to: Optional path; if given, the raw bytes of the
        search-results page are dumped there (debugging aid).
    '''
    url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(
        quote_plus(query))
    counter = max_results
    br = browser()
    raw = br.open(url).read()
    if write_raw_to is not None:
        with open(write_raw_to, 'wb') as f:
            f.write(raw)
    root = parse(raw)
    CSSSelect = Select(root)
    for li in CSSSelect('li.booklink'):
        if counter <= 0:
            break
        counter -= 1

        s = SearchResult()
        a = next(CSSSelect('a.link', li))
        s.detail_item = absurl(a.get('href'))
        s.title = etree.tostring(next(CSSSelect('span.title', li)),
                                 method='text', encoding='unicode').strip()
        try:
            s.author = etree.tostring(next(CSSSelect('span.subtitle', li)),
                                      method='text', encoding='unicode').strip()
        except StopIteration:
            # Not every entry has a subtitle (author) span
            s.author = ""
        for img in CSSSelect('img.cover-thumb', li):
            s.cover_url = absurl(img.get('src'))
            break

        # Get the formats and direct download links from the details page.
        details_doc = parse(br.open_novisit(s.detail_item).read())
        doc_select = Select(details_doc)
        for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
            for a in doc_select('a.link', tr):
                href = a.get('href')
                # renamed from `type`, which shadowed the builtin
                mt = a.get('type')
                ext = mimetypes.guess_extension(
                    mt.split(';')[0]) if mt else None
                if href and ext:
                    url = absurl(href.split('?')[0])
                    ext = ext[1:].upper().strip()
                    # Keep only the first link seen for each format
                    if ext not in s.downloads:
                        s.downloads[ext] = url
                break

        s.formats = ', '.join(s.downloads.keys())
        if not s.formats:
            continue

        yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    '''
    Search the Kobo store and yield up to ``max_results`` SearchResult
    objects. A result is yielded only when a title, authors and a detail
    URL could all be extracted.
    '''
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    raw = read_url(url, timeout=timeout)
    if write_html_to is not None:
        # Debugging aid: dump the fetched page to disk
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for result_index, book in enumerate(select('.result-items .item-wrapper.book')):
        if result_index == max_results:
            break

        # Cover: first matching image; protocol-relative URLs get https
        cover_url = None
        for img in select('.item-image img[src]', book):
            cover_url = img.get('src')
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            break

        # Title + detail URL come from the first h2.title element
        title = None
        for heading in select('h2.title', book):
            title = etree.tostring(heading, method='text', encoding='unicode').strip()
            url = None
            for link in select('a[href]', heading):
                url = link.get('href')
                break
            break

        if title:
            # Append any subtitles to the title
            for sub in select('p.subtitle', book):
                title += ' - ' + etree.tostring(
                    sub, method='text', encoding='unicode').strip()

        contributors = [
            etree.tostring(a, method='text', encoding='unicode').strip()
            for a in select('.contributors a.contributor-name', book)
        ]
        authors = authors_to_string(contributors)

        price = None
        for tag in select('p.price', book):
            price = etree.tostring(tag, method='text', encoding='unicode').strip()
            break

        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def get_details(self):
    '''Fetch (or reuse a pre-parsed) details page, then run the parser.'''
    parsed = self.preparsed_root
    if parsed is None:
        parsed = parse_details_page(self.url, self.log, self.timeout,
                                    self.browser, self.domain)
    raw, root, selector = parsed
    from css_selectors import Select
    self.selector = Select(root)
    self.parse_details(raw, root)
def parse_details_page(url, log, timeout, browser, domain):
    '''
    Download and parse an Amazon book-details page.

    Returns a 3-tuple ``(raw_bytes, lxml_root, css_selector)`` or None on
    any failure (network error, 404, unparseable HTML, Amazon error page).
    For the .jp domain, "black curtain" age-gate redirects are followed
    recursively.
    '''
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return
    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return
    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except Exception:
        # Was a bare except: — narrowed so KeyboardInterrupt/SystemExit propagate
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp'+a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)
    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        # BUG FIX: xpath() returns a list; tostring() needs an element, so
        # pass errmsg[0]. encoding='unicode' (the string) is the portable
        # lxml spelling — the bare name `unicode` does not exist on Python 3.
        msg += tostring(errmsg[0], method='text', encoding='unicode').strip()
        log.error(msg)
        return
    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def parse_details_page(url, log, timeout, browser):
    '''
    Download and parse an amazon.cn book-details page (served as gb18030).

    Returns a 3-tuple ``(decoded_text, lxml_root, css_selector)`` or None
    on any failure (network error, 404, unparseable HTML, error page).
    '''
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(
            url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return
    oraw = raw
    # (removed a no-op `raw = raw` statement)
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return
    try:
        # NOTE: unlike the multi-domain variant, this does not run the text
        # through clean_ascii_chars before parsing — kept as-is.
        root = html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False)
    except Exception:
        # Was a bare except: — narrowed so KeyboardInterrupt/SystemExit propagate
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return
    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        # BUG FIX: xpath() returns a list; tostring() needs an element, so
        # pass errmsg[0]. encoding='unicode' is the portable lxml spelling.
        msg += tostring(errmsg[0], method='text', encoding='unicode').strip()
        log.error(msg)
        return
    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def find_first_matching_rule(container, html_file_name, raw_html, class_data,
                             lnum_attr='data-lnum'):
    # Locate the first CSS rule (inline <style> or linked stylesheet) that
    # applies the given class to the tag identified by a (line number, tag
    # sequence) address inside raw_html.
    #
    # class_data['sourceline_address'] is a (line_number, [tag names]) pair
    # identifying the target element; class_data['class'] is the class name
    # whose matching rule is sought.
    #
    # Raises NoMatchingTagFound if the addressed tags are not on that line,
    # NoMatchingRuleFound if no applicable rule exists.
    lnum, tags = class_data['sourceline_address']
    class_name = class_data['class']
    # Re-parse with line-number annotation so elements can be found by the
    # original source line they came from.
    root = parse(raw_html, decoder=lambda x: x.decode('utf-8'),
                 line_numbers=True, linenumber_attribute=lnum_attr)
    tags_on_line = root.xpath(f'//*[@{lnum_attr}={lnum}]')
    barenames = [barename(tag.tag) for tag in tags_on_line]
    # The tags on the addressed line must start with the expected sequence
    if barenames[:len(tags)] != tags:
        raise NoMatchingTagFound(
            f'No tag matching the specification was found in {html_file_name}')
    # The target is the last tag of the addressed sequence
    target_elem = tags_on_line[len(tags) - 1]
    select = Select(root, ignore_inappropriate_pseudo_classes=True)
    # Walk the document in order, checking every stylesheet source as it
    # appears, so the *first* matching rule (in cascade order) is returned.
    for tag in root.iter('*'):
        tn = barename(tag.tag)
        if tn == 'style' and tag.text and tag.get('type', 'text/css') == 'text/css':
            try:
                sheet = container.parse_css(tag.text)
            except Exception:
                continue  # unparseable inline CSS: skip this <style> block
            res = find_first_rule_that_matches_elem(
                container, target_elem, select, class_name, sheet.cssRules,
                html_file_name)
            if res is not None:
                # Record where the containing <style> tag lives so the
                # caller can address it
                return res._replace(
                    style_tag_address=(int(tag.get(lnum_attr)), ['style']))
        elif tn == 'link' and tag.get('href') and tag.get(
                'rel') == 'stylesheet':
            sname = container.href_to_name(tag.get('href'), html_file_name)
            try:
                sheet = container.parsed(sname)
            except Exception:
                continue  # missing/unparseable linked sheet: skip
            if not hasattr(sheet, 'cssRules'):
                continue  # linked resource is not actually CSS
            res = find_first_rule_that_matches_elem(
                container, target_elem, select, class_name, sheet.cssRules,
                sname)
            if res is not None:
                return res
    raise NoMatchingRuleFound(
        f'No CSS rules that apply to the specified tag in {html_file_name} with the class {class_name} found'
    )
def parse(self, raw): from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import parse_only_date, UNDEFINED_DATE from css_selectors import Select root = parse_html(raw) selector = Select(root) sku = next(selector('div.sku.attGroup')) info = sku.getparent() top = info.getparent().getparent() banner = top.find('div') spans = banner.findall('span') title = '' for i, span in enumerate(spans): if i == 0 or '12pt' in span.get('style', ''): title += astext(span) else: break authors = [ re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',') ] mi = Metadata(title.strip(), authors) # Identifiers isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')] for isbn in isbns: if isbn: self.plugin.cache_isbn_to_identifier(isbn, self.sku) isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True) if isbns and isbns[0]: mi.isbn = isbns[0] mi.set_identifier('edelweiss', self.sku) # Tags bisac = tuple(selector('div.bisac.attGroup')) if bisac: bisac = astext(bisac[0]) mi.tags = [x.strip() for x in bisac.split(',')] mi.tags = [ t[1:].strip() if t.startswith('&') else t for t in mi.tags ] # Publisher pub = tuple(selector('div.supplier.attGroup')) if pub: pub = astext(pub[0]) mi.publisher = pub # Pubdate pub = tuple(selector('div.shipDate.attGroupItem')) if pub: pub = astext(pub[0]) parts = pub.partition(':')[0::2] pub = parts[1] or parts[0] try: if ', Ship Date:' in pub: pub = pub.partition(', Ship Date:')[0] q = parse_only_date(pub, assume_utc=True) if q.year != UNDEFINED_DATE: mi.pubdate = q except: self.log.exception('Error parsing published date: %r' % pub) # Comments comm = '' general = tuple(selector('div#pd-general-overview-content')) if general: q = self.render_comments(general[0]) if q != '<p>No title summary available. 
</p>': comm += q general = tuple(selector('div#pd-general-contributor-content')) if general: comm += self.render_comments(general[0]) general = tuple(selector('div#pd-general-quotes-content')) if general: comm += self.render_comments(general[0]) if comm: mi.comments = comm # Cover img = tuple(selector('img.title-image[src]')) if img: href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/') self.plugin.cache_identifier_to_cover_url(self.sku, href) mi.has_cover = self.plugin.cached_identifier_to_cover_url( self.sku) is not None return mi
def identify(  # {{{
        self, log, result_queue, abort, title=None, authors=None,
        identifiers={}, timeout=30):
    # Identify a book on Edelweiss. If an 'edelweiss' identifier is already
    # known, go straight to its book URL; otherwise run a search, collect
    # candidate SKUs (filtering by author when no ISBN constrains the
    # match), then spawn Worker threads to fetch details for up to 5
    # candidates, waiting until they finish or `abort` is set.
    #
    # NOTE(review): Python 2 module — `urlparse` does not exist on Py3.
    from urlparse import parse_qs
    book_url = self._get_book_url(identifiers.get('edelweiss', None))
    br = self.browser
    if book_url:
        entries = [(book_url, identifiers['edelweiss'])]
    else:
        entries = []
        query = self.create_query(log, title=title, authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        log('Using query URL:', query)
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        try:
            root = parse_html(raw)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        from css_selectors import Select
        select = Select(root)
        has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
        if not has_isbn:
            # Pre-compute tokens of the first author for manual filtering
            author_tokens = set(x.lower() for x in self.get_author_tokens(
                authors, only_first_author=True))
        for entry in select('div.listRow div.listRowMain'):
            # Each result row links to a product-detail page whose query
            # string carries the SKU
            a = entry.xpath(
                'descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]'
            )
            if not a:
                continue
            href = a[0].get('href')
            prefix, qs = href.partition('?')[0::2]
            sku = parse_qs(qs).get('sku', None)
            if sku and sku[0]:
                sku = sku[0]
                # Cache any ISBNs shown in the row against this SKU
                div = tuple(select('div.sku.attGroup'))
                if div:
                    text = astext(div[0])
                    isbns = [
                        check_isbn(x.strip()) for x in text.split(',')
                    ]
                    for isbn in isbns:
                        if isbn:
                            self.cache_isbn_to_identifier(isbn, sku)
                # Cache the cover thumbnail (upgraded to flyout size)
                for img in entry.xpath(
                        'descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'
                ):
                    self.cache_identifier_to_cover_url(
                        sku,
                        img.get('src').replace('/thumbnail/', '/flyout/'))
                div = tuple(select('div.format.attGroup'))
                text = astext(div[0]).lower()
                if 'audio' in text or 'mp3' in text:  # Audio-book, ignore
                    continue
                if not has_isbn:
                    # edelweiss returns matches based only on title, so we
                    # filter by author manually
                    div = tuple(select('div.contributor.attGroup'))
                    try:
                        entry_authors = set(
                            self.get_author_tokens([
                                x.strip()
                                for x in astext(div[0]).lower().split(',')
                            ]))
                    except IndexError:
                        entry_authors = set()
                    if not entry_authors.issuperset(author_tokens):
                        continue
                entries.append((self._get_book_url(sku), sku))
    if (not entries and identifiers and title and authors and
            not abort.is_set()):
        # Identifier-based lookup failed: retry on title/authors alone
        return self.identify(log, result_queue, abort, title=title,
                             authors=authors, timeout=timeout)
    if not entries:
        return
    workers = [
        Worker(skul, url, i, result_queue, br.clone_browser(), timeout,
               log, self) for i, (url, skul) in enumerate(entries[:5])
    ]
    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)
    # Poll the workers until all finish or the operation is aborted
    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break
def find_page_breaks(self, item):
    # Find elements in `item` (a spine document) that CSS rules mark with
    # page-break-before/after, returning (list of (XPath, break_before)
    # pairs, list of the element ids used). The selector set is computed
    # once from all manifest stylesheets and cached on self.
    if self.page_break_selectors is None:
        self.page_break_selectors = set()
        stylesheets = [
            x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES
        ]
        for rule in rules(stylesheets):
            # Extract the page-break properties; missing properties give ''
            before = getattr(
                rule.style.getPropertyCSSValue('page-break-before'),
                'cssText', '').strip().lower()
            after = getattr(
                rule.style.getPropertyCSSValue('page-break-after'),
                'cssText', '').strip().lower()
            try:
                # Only explicit break values count; 'avoid'/'auto'/'inherit'
                # do not force a break
                if before and before not in {'avoid', 'auto', 'inherit'}:
                    self.page_break_selectors.add(
                        (rule.selectorText, True))
                    if self.remove_css_pagebreaks:
                        rule.style.removeProperty('page-break-before')
            except:
                pass
            try:
                if after and after not in {'avoid', 'auto', 'inherit'}:
                    self.page_break_selectors.add(
                        (rule.selectorText, False))
                    if self.remove_css_pagebreaks:
                        rule.style.removeProperty('page-break-after')
            except:
                pass
    page_breaks = set()
    select = Select(item.data)
    if not self.page_break_selectors:
        return [], []
    body = item.data.xpath('//h:body', namespaces=NAMESPACES)
    if not body:
        return [], []
    descendants = frozenset(body[0].iterdescendants('*'))
    # Stamp matching elements with a temporary pb_before attribute
    for selector, before in self.page_break_selectors:
        try:
            for elem in select(selector):
                # Only honor breaks on real content elements inside <body>
                if elem in descendants and elem.tag.rpartition(
                        '}')[2].lower() not in {
                            'html', 'body', 'head', 'style', 'script',
                            'meta', 'link'
                        }:
                    elem.set('pb_before', '1' if before else '0')
                    page_breaks.add(elem)
        except SelectorError as err:
            self.log.warn(
                'Ignoring page breaks specified with invalid CSS selector: %r (%s)'
                % (selector, as_unicode(err)))
    # Stamp every element with its document order so breaks can be sorted
    for i, elem in enumerate(item.data.iter('*')):
        try:
            elem.set('pb_order', str(i))
        except TypeError:  # Cant set attributes on comment nodes etc.
            continue
    page_breaks = list(page_breaks)
    page_breaks.sort(key=lambda x: int(x.get('pb_order')))
    page_break_ids, page_breaks_ = [], []
    for i, x in enumerate(page_breaks):
        # Ensure each break element has an id we can address via XPath
        x.set('id', x.get('id', 'calibre_pb_%d' % i))
        id = x.get('id')
        try:
            xp = XPath('//*[@id="%s"]' % id)
        except:
            try:
                xp = XPath("//*[@id='%s']" % id)
            except:
                # The id has both a quote and an apostrophe or some other
                # Just replace it since I doubt its going to work anywhere else
                # either
                id = 'calibre_pb_%d' % i
                x.set('id', id)
                xp = XPath('//*[@id=%r]' % id)
        page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
        page_break_ids.append(id)
    # Remove the temporary attributes from the document
    for elem in item.data.iter(etree.Element):
        elem.attrib.pop('pb_order', False)
        elem.attrib.pop('pb_before', False)
    return page_breaks_, page_break_ids
def css_data(container, book_locale, result_data, *args):
    '''
    Build the CSS usage report for a book: for every CSS rule, where it is
    defined and which elements it matches; for every class attribute, which
    rules use it. Class data is stored in result_data['classes']; the list
    of CSSEntry objects is returned.
    '''
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten a tinycss rule list into CSSRule tuples. @import rules
        # become the imported sheet's name (followed later); nested rules
        # (e.g. @media) are recursed into.
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(
                    CSSRule(
                        selector,
                        RuleLocation(file_name, sourceline + rule.line,
                                     rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri,
                                                file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')
    # Parse every stylesheet file, and every inline <style> in spine docs
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(
                name,
                parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    html_sheets[name].append(
                        css_rules(
                            name,
                            parser.parse_stylesheet(
                                force_unicode(style.text, 'utf-8')).rules,
                            style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))

    def rules_in_sheet(sheet):
        # Yield CSSRule objects, transparently following @import entries
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            else:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    for irule in rules_in_sheet(isheet):
                        yield irule

    def sheets_for_html(name, root):
        # Sheets linked from this document via <link href=...>
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        # Short display form of an element's start tag, cached per element
        # (elements with attributes are rendered fresh each time and not
        # cached).
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join(
                    '%s="%s"' %
                    (k, prepare_string_for_xml(elem.get(k, ''), True))
                    for k in elem.keys())
                return '<%s %s>' % (tag, attribs)
            ans = tt_cache[elem] = '<%s>' % tag
        # BUG FIX: the original fell off the end here, returning None for
        # attribute-less and cached elements (making the cache useless and
        # producing None MatchLocation text).
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        # Record which classes on matched elements this rule references
        for elem in matches:
            for cls in elem.get('class', '').split():
                if '.' + cls.lower() in lsel:
                    class_map[cls][elem].append(rule)
        return (MatchLocation(tag_text(elem), elem.sourceline)
                for elem in matches)

    class_map = defaultdict(lambda: defaultdict(list))

    for name, inline_sheets in iteritems(html_sheets):
        root = container.parsed(name)
        cmap = defaultdict(lambda: defaultdict(list))
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(
                    matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in iteritems(cmap):
            class_elements = class_map[cls][name]
            for elem, usage in iteritems(elem_map):
                class_elements.append(
                    ClassElement(name, elem.sourceline, elem.get('class'),
                                 tag_text(elem), tuple(usage)))

    # Aggregate per-class usage into result_data['classes']
    result_data['classes'] = ans = []
    for cls, name_map in iteritems(class_map):
        la = tuple(
            ClassFileMatch(name, tuple(class_elements),
                           numeric_sort_key(name))
            for name, class_elements in iteritems(name_map)
            if class_elements)
        num_of_matches = sum(
            sum(len(ce.matched_rules) for ce in cfm.class_elements)
            for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la,
                              numeric_sort_key(cls)))

    # Aggregate per-rule matches; this list is the return value
    ans = []
    for rule, loc_map in iteritems(rule_map):
        la = tuple(
            CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
            for name, locations in iteritems(loc_map) if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la,
                            numeric_sort_key(rule.selector)))

    return ans
def __init__(self, tree, path, oeb, opts, profile=None,
             extra_css='', user_css='', base_css=''):
    # Build a style resolver for one HTML document: collect all applicable
    # stylesheets (base, inline <style>, linked, extra/user CSS), flatten
    # their rules, then apply matched declarations to each element in
    # `tree`.
    #
    # NOTE(review): Python 2 module — uses `xrange`, `unicode` and ur''
    # literals; do not run under Python 3 unmodified.
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = xpath(tree,
                       '//*[local-name()="style" or local-name()="link"]')

    # Add cssutils parsing profiles from output_profile
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'], profile['props'],
                               profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in style_tags:
        if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
            # Gather the full text of the <style> element including text
            # in/after child nodes (e.g. comment nodes)
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if rule.media.mediaText == 'amzn-mobi':
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn(
                                'Ignoring missing stylesheet in @import rule:',
                                rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn(
                                'CSS @import of non-CSS file %r' % rule.href)
                            continue
                        stylesheets.append(sitem.data)
                # @page rules are not supported; drop them
                for rule in tuple(
                        stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                    stylesheet.cssRules.remove(rule)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif elem.tag == XHTML('link') and elem.get('href') \
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
            href = urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' %
                    (path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                continue
            stylesheets.append(sitem.data)
    # extra_css/user_css are appended last so they win at equal specificity
    csses = {'extra_css':extra_css, 'user_css':user_css}
    for w, x in csses.items():
        if x:
            try:
                text = x
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                stylesheets.append(stylesheet)
            except:
                self.logger.exception('Failed to parse %s, ignoring.'%w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)
    # Flatten all rules into (sorted) tuples; index preserves source order
    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for sheet_index, stylesheet in enumerate(stylesheets):
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                media = {rule.media.item(i)
                         for i in xrange(rule.media.length)}
                # Only media blocks that can apply to the screen output
                if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                    continue
                for subrule in rule.cssRules:
                    rules.extend(self.flatten_rule(
                        subrule, href, index,
                        is_user_agent_sheet=sheet_index==0))
                    index += 1
            else:
                rules.extend(self.flatten_rule(
                    rule, href, index,
                    is_user_agent_sheet=sheet_index==0))
                index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(
        INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)
    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        try:
            matches = tuple(select(text))
        except SelectorError as err:
            self.logger.error(
                'Ignoring CSS rule with invalid selector: %r (%s)' %
                (text, as_unicode(err)))
            continue
        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(
                    self.oeb, 'plumber_output_format',
                    '').lower() in {u'mobi', u'docx'}:
                # Fake first-letter
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            # Leading punctuation/whitespace is included in
                            # the first-letter span per CSS semantics
                            punctuation_chars = []
                            text = unicode(x.text)
                            while text:
                                category = unicodedata.category(text[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]
                            special_text = u''.join(punctuation_chars) + \
                                (text[0] if text else u'')
                            span = E.span(special_text)
                            span.set('data-fake-first-letter', '1')
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    # Inline style="" attributes override stylesheet rules
    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)
    num_pat = re.compile(r'[0-9.]+$')
    # Convert bare width/height attributes on images into CSS, unless CSS
    # already styles the dimensions
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
            style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    # Bare numbers are pixel values
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
def remove_unused_css(container, report=None, remove_unused_classes=False,
                      merge_rules=False,
                      merge_rules_with_identical_properties=False):
    '''
    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.

    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
    :param merge_rules: If True, rules with identical selectors are merged.
    '''
    report = report or (lambda x: x)

    def safe_parse(name):
        # Returns None (via fallthrough) when the sheet cannot be parsed
        try:
            return container.parsed(name)
        except TypeError:
            pass
    sheets = {
        name: safe_parse(name)
        for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES
    }
    sheets = {k: v for k, v in iteritems(sheets) if v is not None}
    num_merged = num_rules_merged = 0
    # Optional pre-passes: merge rules in the standalone stylesheets
    if merge_rules:
        for name, sheet in iteritems(sheets):
            num = merge_identical_selectors(sheet)
            if num:
                container.dirty(name)
                num_merged += num
    if merge_rules_with_identical_properties:
        for name, sheet in iteritems(sheets):
            num = merge_identical_properties(sheet)
            if num:
                container.dirty(name)
                num_rules_merged += num
    import_map = {
        name: get_imported_sheets(name, container, sheets)
        for name in sheets
    }
    if remove_unused_classes:
        # All class names referenced by each stylesheet, lowercased
        class_map = {
            name:
            {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
            for name, sheet in iteritems(sheets)
        }
    style_rules = {
        name: tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
        for name, sheet in iteritems(sheets)
    }
    removal_stats = {'rules': 0, 'selectors': 0}
    num_of_removed_classes = 0
    # Walk every spine document, marking which selectors actually match
    for name, mt in iteritems(container.mime_map):
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        used_classes = set()
        # Inline <style> blocks: merge/mark/prune in place
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') == 'text/css' and style.text:
                sheet = container.parse_css(style.text)
                if merge_rules:
                    num = merge_identical_selectors(sheet)
                    if num:
                        num_merged += num
                        container.dirty(name)
                if merge_rules_with_identical_properties:
                    num = merge_identical_properties(sheet)
                    if num:
                        num_rules_merged += num
                        container.dirty(name)
                if remove_unused_classes:
                    used_classes |= {
                        icu_lower(x)
                        for x in classes_in_rule_list(sheet.cssRules)
                    }
                imports = get_imported_sheets(name, container, sheets,
                                              sheet=sheet)
                for imported_sheet in imports:
                    mark_used_selectors(style_rules[imported_sheet],
                                        container.log, select)
                    if remove_unused_classes:
                        used_classes |= class_map[imported_sheet]
                rules = tuple(
                    sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
                if mark_used_selectors(rules, container.log, select):
                    # Something matched: prune the inline sheet and write
                    # the cleaned CSS back into the <style> element
                    remove_unused_selectors_and_rules(
                        sheet.cssRules, rules, removal_stats)
                    style.text = force_unicode(sheet.cssText, 'utf-8')
                    pretty_script_or_style(container, style)
                    container.dirty(name)
        # Linked stylesheets (and their imports) are only marked here;
        # pruning happens once, after all documents are processed
        for link in root.xpath('//*[local-name()="link" and @href]'):
            sname = container.href_to_name(link.get('href'), name)
            if sname not in sheets:
                continue
            mark_used_selectors(style_rules[sname], container.log, select)
            if remove_unused_classes:
                used_classes |= class_map[sname]
            for iname in import_map[sname]:
                mark_used_selectors(style_rules[iname], container.log,
                                    select)
                if remove_unused_classes:
                    used_classes |= class_map[iname]
        if remove_unused_classes:
            # Strip class names that no reachable stylesheet references
            for elem in root.xpath('//*[@class]'):
                original_classes, classes = elem.get('class',
                                                     '').split(), []
                for x in original_classes:
                    if icu_lower(x) in used_classes:
                        classes.append(x)
                if len(classes) != len(original_classes):
                    if classes:
                        elem.set('class', ' '.join(classes))
                    else:
                        del elem.attrib['class']
                    num_of_removed_classes += len(original_classes) - len(
                        classes)
                    container.dirty(name)
    # Final pass: drop selectors/rules never marked as used
    for name, sheet in iteritems(sheets):
        any_found = remove_unused_selectors_and_rules(
            sheet.cssRules, style_rules[name], removal_stats)
        if any_found:
            container.dirty(name)
    num_changes = num_merged + num_of_removed_classes + num_rules_merged + removal_stats[
        'rules'] + removal_stats['selectors']
    if num_changes > 0:
        if removal_stats['rules']:
            report(
                ngettext('Removed one unused CSS style rule',
                         'Removed {} unused CSS style rules',
                         removal_stats['rules']).format(
                             removal_stats['rules']))
        if removal_stats['selectors']:
            report(
                ngettext('Removed one unused CSS selector',
                         'Removed {} unused CSS selectors',
                         removal_stats['selectors']).format(
                             removal_stats['selectors']))
        if num_of_removed_classes > 0:
            report(
                ngettext(
                    'Removed one unused class from the HTML',
                    'Removed {} unused classes from the HTML',
                    num_of_removed_classes).format(num_of_removed_classes))
        if num_merged > 0:
            report(
                ngettext('Merged one CSS style rule with identical selectors',
                         'Merged {} CSS style rules with identical selectors',
                         num_merged).format(num_merged))
        if num_rules_merged > 0:
            report(
                ngettext(
                    'Merged one CSS style rule with identical properties',
                    'Merged {} CSS style rules with identical properties',
                    num_rules_merged).format(num_rules_merged))
    if not removal_stats['rules']:
        report(_('No unused CSS style rules found'))
    if not removal_stats['selectors']:
        report(_('No unused CSS selectors found'))
    if remove_unused_classes and num_of_removed_classes == 0:
        report(_('No unused class attributes found'))
    if merge_rules and num_merged == 0:
        report(_('No style rules that could be merged found'))
    return num_changes > 0
def resolve_styles(container, name, select=None, sheet_callback=None):
    '''
    Compute the fully cascaded style for every element in the HTML file
    ``name``.

    All applicable stylesheets (the built-in user-agent sheet, <style> tags,
    linked CSS files) are walked rule by rule; each matched element
    accumulates ``StyleDeclaration`` entries which are then sorted by
    specificity and resolved into a single declaration per element.

    :param select: Optional pre-built CSS ``Select`` object for ``root``.
    :param sheet_callback: Optional callable invoked as
        ``sheet_callback(sheet, sheet_name)`` for every processed sheet.
    :return: A 3-tuple ``(resolve_property, resolve_pseudo_property, select)``
        of partials bound to the computed style maps.
    '''
    root = container.parsed(name)
    select = select or Select(root, ignore_inappropriate_pseudo_classes=True)
    style_map = defaultdict(list)
    pseudo_style_map = defaultdict(list)
    rule_index_counter = count()
    # Matches selectors using pseudo-classes that cannot be evaluated
    # statically (e.g. :hover); their declarations go into a separate map.
    pseudo_pat = re.compile(
        u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)

    def process_sheet(sheet, sheet_name):
        # Walk all STYLE_RULEs in the sheet, recording a StyleDeclaration
        # (with its specificity) for every element each selector matches.
        if sheet_callback is not None:
            sheet_callback(sheet, sheet_name)
        for rule, sheet_name, rule_index in iterrules(
                container,
                sheet_name,
                rules=sheet,
                rule_index_counter=rule_index_counter,
                rule_type='STYLE_RULE'):
            for selector in rule.selectorList:
                text = selector.selectorText
                try:
                    matches = tuple(select(text))
                except SelectorError as err:
                    container.log.error(
                        'Ignoring CSS rule with invalid selector: %r (%s)' %
                        (text, as_unicode(err)))
                    continue
                m = pseudo_pat.search(text)
                style = normalize_style_declaration(rule.style, sheet_name)
                if m is None:
                    for elem in matches:
                        style_map[elem].append(
                            StyleDeclaration(
                                specificity(rule_index, selector), style,
                                None))
                else:
                    # Declarations behind dynamic pseudo-classes are tracked
                    # separately, keyed by the pseudo-class name.
                    for elem in matches:
                        pseudo_style_map[elem].append(
                            StyleDeclaration(
                                specificity(rule_index, selector), style,
                                m.group(1)))

    # The browser default stylesheet has the lowest priority.
    process_sheet(html_css_stylesheet(container), 'user-agent.css')

    for elem in root.iterdescendants(XHTML('style'), XHTML('link')):
        if elem.tag.lower().endswith('style'):
            if not elem.text:
                continue
            sheet = container.parse_css(elem.text)
            sheet_name = name
        else:
            # Only stylesheet links with an applicable media attribute and a
            # resolvable href that parses to a CSSStyleSheet are considered.
            if (elem.get('type') or 'text/css').lower() not in OEB_STYLES or \
                    (elem.get('rel') or 'stylesheet').lower() != 'stylesheet' or \
                    not media_ok(elem.get('media')):
                continue
            href = elem.get('href')
            if not href:
                continue
            sheet_name = container.href_to_name(href, name)
            if not container.has_name(sheet_name):
                continue
            sheet = container.parsed(sheet_name)
            if not isinstance(sheet, CSSStyleSheet):
                continue
        process_sheet(sheet, sheet_name)

    # Inline style="" attributes get the highest specificity.
    for elem in root.xpath('//*[@style]'):
        text = elem.get('style')
        if text:
            style = container.parse_css(text, is_declaration=True)
            style_map[elem].append(
                StyleDeclaration(
                    Specificity(1, 0, 0, 0, 0),
                    normalize_style_declaration(style, name), None))

    # NOTE: .values()/iteritems(...) instead of the Python-2-only
    # .itervalues()/.iteritems() methods, matching usage elsewhere in
    # this module and keeping the code working on Python 3.
    for l in (style_map, pseudo_style_map):
        for x in l.values():
            # Highest specificity first; declaration order (encoded in the
            # specificity tuple) breaks ties.
            x.sort(key=itemgetter(0), reverse=True)

    style_map = {
        elem: resolve_declarations(x)
        for elem, x in iteritems(style_map)
    }
    pseudo_style_map = {
        elem: resolve_pseudo_declarations(x)
        for elem, x in iteritems(pseudo_style_map)
    }

    return partial(resolve_property, style_map), partial(
        resolve_pseudo_property, style_map, pseudo_style_map), select
def remove_unused_css(container, report=None, remove_unused_classes=False):
    '''
    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.

    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
    :return: True if anything (rules or class attributes) was removed.
    '''
    report = report or (lambda x: x)

    def safe_parse(name):
        # container.parsed() raises TypeError for unparseable entries; treat
        # those as "no sheet" rather than aborting the whole operation.
        try:
            return container.parsed(name)
        except TypeError:
            pass

    # NOTE: iteritems(...) (the py2/py3 compat helper already used elsewhere
    # in this module) instead of the Python-2-only dict.iteritems() method.
    sheets = {
        name: safe_parse(name)
        for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES
    }
    sheets = {k: v for k, v in iteritems(sheets) if v is not None}
    import_map = {
        name: get_imported_sheets(name, container, sheets)
        for name in sheets
    }
    if remove_unused_classes:
        class_map = {
            name: {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
            for name, sheet in iteritems(sheets)
        }
    # style_rules starts as all STYLE_RULEs per sheet and is progressively
    # narrowed (by filter_used_rules) to the rules no document uses.
    style_rules = {
        name: tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
        for name, sheet in iteritems(sheets)
    }

    num_of_removed_rules = num_of_removed_classes = 0

    for name, mt in iteritems(container.mime_map):
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        used_classes = set()
        # Inline <style> tags: their unused rules can be removed immediately
        # since they apply only to this document.
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') == 'text/css' and style.text:
                sheet = container.parse_css(style.text)
                if remove_unused_classes:
                    used_classes |= {
                        icu_lower(x)
                        for x in classes_in_rule_list(sheet.cssRules)
                    }
                imports = get_imported_sheets(
                    name, container, sheets, sheet=sheet)
                for imported_sheet in imports:
                    style_rules[imported_sheet] = tuple(
                        filter_used_rules(style_rules[imported_sheet],
                                          container.log, select))
                    if remove_unused_classes:
                        used_classes |= class_map[imported_sheet]
                rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
                unused_rules = tuple(
                    filter_used_rules(rules, container.log, select))
                if unused_rules:
                    num_of_removed_rules += len(unused_rules)
                    # Plain loop instead of a side-effect list comprehension.
                    for r in unused_rules:
                        sheet.cssRules.remove(r)
                    style.text = force_unicode(sheet.cssText, 'utf-8')
                    pretty_script_or_style(container, style)
                    container.dirty(name)

        # Linked stylesheets are shared between documents, so only narrow
        # their candidate-unused rule lists here; removal happens at the end.
        for link in root.xpath('//*[local-name()="link" and @href]'):
            sname = container.href_to_name(link.get('href'), name)
            if sname not in sheets:
                continue
            style_rules[sname] = tuple(
                filter_used_rules(style_rules[sname], container.log, select))
            if remove_unused_classes:
                used_classes |= class_map[sname]
            for iname in import_map[sname]:
                style_rules[iname] = tuple(
                    filter_used_rules(style_rules[iname], container.log,
                                      select))
                if remove_unused_classes:
                    used_classes |= class_map[iname]

        if remove_unused_classes:
            for elem in root.xpath('//*[@class]'):
                original_classes, classes = elem.get('class', '').split(), []
                for x in original_classes:
                    if icu_lower(x) in used_classes:
                        classes.append(x)
                if len(classes) != len(original_classes):
                    if classes:
                        elem.set('class', ' '.join(classes))
                    else:
                        del elem.attrib['class']
                    num_of_removed_classes += len(original_classes) - len(
                        classes)
                    container.dirty(name)

    # Whatever survived every document's filter pass is truly unused.
    for name, sheet in iteritems(sheets):
        unused_rules = style_rules[name]
        if unused_rules:
            num_of_removed_rules += len(unused_rules)
            for r in unused_rules:
                sheet.cssRules.remove(r)
            container.dirty(name)

    if num_of_removed_rules > 0:
        report(
            ngettext('Removed %d unused CSS style rule',
                     'Removed %d unused CSS style rules',
                     num_of_removed_rules) % num_of_removed_rules)
    else:
        report(_('No unused CSS style rules found'))
    if remove_unused_classes:
        if num_of_removed_classes > 0:
            report(
                ngettext('Removed %d unused class from the HTML',
                         'Removed %d unused classes from the HTML',
                         num_of_removed_classes) % num_of_removed_classes)
        else:
            report(_('No unused class attributes found'))
    return num_of_removed_rules + num_of_removed_classes > 0