def find_rule(raw, rule_address):
    """Return the ``(line, column)`` of the rule addressed by *rule_address*.

    *raw* is parsed as a stylesheet with tinycss. *rule_address* is a
    sequence of indices: the first selects a top-level rule, and each
    following index selects a rule inside the ``rules`` attribute of the rule
    chosen before it (rules without that attribute are treated as having no
    children).

    Returns ``(None, None)`` when the address is empty/falsy or when any
    index is out of range.
    """
    import tinycss

    stylesheet = tinycss.make_full_parser().parse_stylesheet(raw)
    candidates = stylesheet.rules
    location = (None, None)
    current = None
    for depth, index in enumerate(rule_address or ()):
        if depth:
            # Descend into the children of the rule selected one level up.
            candidates = getattr(current, 'rules', ())
        try:
            current = candidates[index]
        except IndexError:
            return None, None
        location = (current.line, current.column)
    return location
def handle_media_queries(raw):
    """Map Kindle specific ``@media`` preludes in *raw* to ``@media screen {``.

    Every ``@media ... {`` prelude found in *raw* is parsed with tinycss. If
    one of its media queries is ``not amzn-mobi`` or ``amzn-kf8`` the whole
    prelude is replaced by ``@media screen {``; otherwise it is left as is.
    A prelude for which the parser produces no rule at all is also mapped to
    ``@media screen {`` instead of raising ``IndexError``.

    Returns the rewritten CSS text.
    """
    # cssutils cannot handle CSS 3 media queries. We look for media queries
    # that use amzn-mobi or amzn-kf8 and map them to a simple @media screen
    # rule. See https://bugs.launchpad.net/bugs/1406708 for an example
    import tinycss
    parser = tinycss.make_full_parser()
    # (media_type, negated) pairs that select KF8 renderers
    kf8_queries = {('amzn-mobi', True), ('amzn-kf8', False)}

    def replace(m):
        prelude = m.group()
        # Append the closing brace so the prelude parses as a complete rule
        sheet = parser.parse_stylesheet(prelude + '}')
        if not sheet.rules:
            # The parser rejected the rule, so there is nothing to inspect;
            # indexing rules[0] here used to raise IndexError.
            return '@media screen {'
        for mq in sheet.rules[0].media:
            # Only accept KF8 media types
            if (mq.media_type, mq.negated) in kf8_queries:
                return '@media screen {'
        return prelude

    return re.sub(r'@media\s[^{]*{', replace, raw)
def handle_media_queries(raw):
    """Map Kindle specific ``@media`` preludes in *raw* to ``@media screen``.

    Every ``@media`` prelude in *raw*, up to and including its first ``{`` or
    ``;``, is parsed with tinycss. If one of its media queries is
    ``not amzn-mobi`` or ``amzn-kf8`` the prelude is replaced by
    ``@media screen {``. A prelude for which the parser produces no rule is
    replaced by ``@media screen`` followed by the terminator that was
    actually matched, so a block-less ``@media ...;`` statement does not get
    turned into an unclosed block. Anything else is left untouched.

    Returns the rewritten CSS text.
    """
    # cssutils cannot handle CSS 3 media queries. We look for media queries
    # that use amzn-mobi or amzn-kf8 and map them to a simple @media screen
    # rule. See https://bugs.launchpad.net/bugs/1406708 for an example
    import tinycss
    parser = tinycss.make_full_parser()
    # (media_type, negated) pairs that select KF8 renderers
    kf8_queries = {('amzn-mobi', True), ('amzn-kf8', False)}

    def replace(m):
        prelude = m.group()
        # Append a closing brace so a "{" terminated prelude parses as a
        # complete rule
        sheet = parser.parse_stylesheet(prelude + '}')
        if not sheet.rules:
            # Empty sheet: the media list does not matter, but the
            # terminator does. Emitting "{" for a ";" terminated statement
            # would open a block that is never closed.
            if prelude.endswith(';'):
                return '@media screen;'
            return '@media screen {'
        for mq in sheet.rules[0].media:
            # Only accept KF8 media types
            if (mq.media_type, mq.negated) in kf8_queries:
                return '@media screen {'
        return prelude

    return re.sub(r'@media\s[^{]*?[{;]', replace, raw)
def css_data(container, book_locale, result_data, *args):
    """Collect usage data for the CSS rules and classes in *container*.

    Stylesheets (files whose mime type is in ``OEB_STYLES``) and the
    ``<style>`` elements of spine documents are parsed with tinycss. Every
    selector is then run against every spine document that links to or
    embeds the sheet it came from.

    :param container: The book container to inspect.
    :param book_locale: Not used by this function.
    :param result_data: Mapping that receives a list of ``ClassEntry``
        objects under the key ``'classes'``.
    :param args: Ignored.
    :return: A list of ``CSSEntry`` objects, one per CSS rule.
    """
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten parsed rules into CSSRule objects and the names of
        # @imported sheets. Rules nested in rules that have a ``rules``
        # attribute are expanded in place.
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(CSSRule(
                    selector,
                    RuleLocation(file_name, sourceline + rule.line, rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(
                name, parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    html_sheets[name].append(css_rules(
                        name,
                        parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules,
                        style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))

    def rules_in_sheet(sheet, active=()):
        # ``active`` holds the names of the imported sheets currently being
        # expanded, so that circular @import chains terminate instead of
        # recursing without bound.
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            elif rule not in active:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    for irule in rules_in_sheet(isheet, active + (rule,)):
                        yield irule

    def sheets_for_html(name, root):
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        # Text of the opening tag of elem, cached per element. This used to
        # fall off the end and return None for elements without attributes,
        # and never cached elements that have attributes.
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join(
                    '%s="%s"' % (k, prepare_string_for_xml(elem.get(k, ''), True))
                    for k in elem.keys())
                ans = '<%s %s>' % (tag, attribs)
            else:
                ans = '<%s>' % tag
            tt_cache[elem] = ans
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        for elem in matches:
            for cls in elem.get('class', '').split():
                # Credit the rule to a class only when the selector text
                # mentions that class
                if '.' + cls.lower() in lsel:
                    class_map[cls][elem].append(rule)

        return (MatchLocation(tag_text(elem), elem.sourceline) for elem in matches)

    class_map = defaultdict(lambda: defaultdict(list))

    for name, inline_sheets in iteritems(html_sheets):
        root = container.parsed(name)
        cmap = defaultdict(lambda: defaultdict(list))
        # Register every class used in the document, so classes that no rule
        # matches are still reported
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(
                    matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in iteritems(cmap):
            class_elements = class_map[cls][name]
            for elem, usage in iteritems(elem_map):
                class_elements.append(ClassElement(
                    name, elem.sourceline, elem.get('class'), tag_text(elem),
                    tuple(usage)))

    result_data['classes'] = ans = []
    for cls, name_map in iteritems(class_map):
        la = tuple(
            ClassFileMatch(name, tuple(class_elements), numeric_sort_key(name))
            for name, class_elements in iteritems(name_map) if class_elements)
        num_of_matches = sum(
            sum(len(ce.matched_rules) for ce in cfm.class_elements) for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la, numeric_sort_key(cls)))

    ans = []
    for rule, loc_map in iteritems(rule_map):
        la = tuple(
            CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
            for name, locations in iteritems(loc_map) if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans
def css_data(container, book_locale, result_data, *args):
    """Collect usage data for the CSS rules and classes in *container*.

    Stylesheets (files whose mime type is in ``OEB_STYLES``) and the
    ``<style>`` elements of spine documents are parsed with tinycss. Every
    selector is then run against every spine document that links to or
    embeds the sheet it came from.

    :param container: The book container to inspect.
    :param book_locale: Not used by this function.
    :param result_data: Mapping that receives a list of ``ClassEntry``
        objects under the key ``'classes'``.
    :param args: Ignored.
    :return: A list of ``CSSEntry`` objects, one per CSS rule.
    """
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten parsed rules into CSSRule objects and the names of
        # @imported sheets. Rules nested in rules that have a ``rules``
        # attribute are expanded in place.
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(CSSRule(
                    selector,
                    RuleLocation(file_name, sourceline + rule.line, rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')
    # .items() rather than .iteritems() so this runs on Python 2 and 3
    for name, mt in container.mime_map.items():
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(
                name, parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    html_sheets[name].append(css_rules(
                        name,
                        parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules,
                        style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))

    def rules_in_sheet(sheet, active=()):
        # ``active`` holds the names of the imported sheets currently being
        # expanded, so that circular @import chains terminate instead of
        # recursing without bound.
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            elif rule not in active:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    for irule in rules_in_sheet(isheet, active + (rule,)):
                        yield irule

    def sheets_for_html(name, root):
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        # Text of the opening tag of elem, cached per element. This used to
        # fall off the end and return None for elements without attributes,
        # and never cached elements that have attributes.
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join(
                    '%s="%s"' % (k, prepare_string_for_xml(elem.get(k, ''), True))
                    for k in elem.keys())
                ans = '<%s %s>' % (tag, attribs)
            else:
                ans = '<%s>' % tag
            tt_cache[elem] = ans
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        for elem in matches:
            for cls in elem.get('class', '').split():
                # Credit the rule to a class only when the selector text
                # mentions that class
                if '.' + cls.lower() in lsel:
                    class_map[cls][elem].append(rule)

        return (MatchLocation(tag_text(elem), elem.sourceline) for elem in matches)

    class_map = defaultdict(lambda: defaultdict(list))

    for name, inline_sheets in html_sheets.items():
        root = container.parsed(name)
        cmap = defaultdict(lambda: defaultdict(list))
        # Register every class used in the document, so classes that no rule
        # matches are still reported
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(
                    matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in cmap.items():
            class_elements = class_map[cls][name]
            for elem, usage in elem_map.items():
                class_elements.append(ClassElement(
                    name, elem.sourceline, elem.get('class'), tag_text(elem),
                    tuple(usage)))

    result_data['classes'] = ans = []
    for cls, name_map in class_map.items():
        la = tuple(
            ClassFileMatch(name, tuple(class_elements), numeric_sort_key(name))
            for name, class_elements in name_map.items() if class_elements)
        num_of_matches = sum(
            sum(len(ce.matched_rules) for ce in cfm.class_elements) for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la, numeric_sort_key(cls)))

    ans = []
    for rule, loc_map in rule_map.items():
        la = tuple(
            CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
            for name, locations in loc_map.items() if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans
def collect_font_stats(self):
    """Record the @font-face rules and font usage of the current page.

    The data is read from the page through ``window.font_stats`` JavaScript
    calls and stored on ``self``, keyed by the container name of
    ``self.current_item``:

    * ``font_rule_map``: the valid @font-face rules of the page
    * ``all_font_rules`` / ``font_stats``: keyed by font file name; the
      latter accumulates the set of characters rendered with each font
    * ``font_usage_map``: per font specification usage, filled only when
      ``self.do_embed`` is set
    * ``font_spec_map``: font family names used, only when ``self.do_embed``
      is set

    Returns ``None``. Returns early, storing nothing, when the page has no
    valid @font-face rules and ``self.do_embed`` is false.

    Raises ``Exception`` when a value read back from the page does not have
    the expected type.
    """
    self.page.evaljs('window.font_stats.get_font_face_rules()')
    font_face_rules = self.page.bridge_value
    if not isinstance(font_face_rules, list):
        raise Exception('Unknown error occurred while reading font-face rules')

    # Weed out invalid font-face rules
    rules = []
    import tinycss
    parser = tinycss.make_full_parser()
    for rule in font_face_rules:
        ff = rule.get('font-family', None)
        if not ff:
            continue
        style = self.parser.parseStyle('font-family:%s'%ff, validate=False)
        ff = [x.value for x in style.getProperty('font-family').propertyValue]
        if not ff or ff[0] == 'inherit':
            continue
        rule['font-family'] = frozenset(icu_lower(f) for f in ff)
        src = rule.get('src', None)
        if not src:
            continue
        try:
            tokens = parser.parse_stylesheet('@font-face { src: %s }' % src).rules[0].declarations[0].value
        except Exception:
            self.log.warn('Failed to parse @font-family src: %s' % src)
            continue
        # Use the first URI in src that resolves to a file in the book
        for token in tokens:
            if token.type == 'URI':
                uv = token.value
                if uv:
                    sn = self.href_to_name(uv, '@font-face rule')
                    if sn is not None:
                        rule['src'] = sn
                        break
        else:
            self.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % src)
            continue
        normalize_font_properties(rule)
        rule['width'] = widths[rule['font-stretch']]
        rule['weight'] = int(rule['font-weight'])
        rules.append(rule)

    if not rules and not self.do_embed:
        return

    # Computed once; assumes abspath_to_name() is a pure lookup
    item_name = self.container.abspath_to_name(self.current_item)
    self.font_rule_map[item_name] = rules
    for rule in rules:
        src = rule['src']
        self.all_font_rules[src] = rule
        if src not in self.font_stats:
            self.font_stats[src] = set()

    self.page.evaljs('window.font_stats.get_font_usage()')
    font_usage = self.page.bridge_value
    if not isinstance(font_usage, list):
        raise Exception('Unknown error occurred while reading font usage')
    self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()')
    pseudo_element_font_usage = self.page.bridge_value
    if not isinstance(pseudo_element_font_usage, list):
        raise Exception('Unknown error occurred while reading pseudo element font usage')
    font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser)
    exclude = {'\n', '\r', '\t'}
    self.font_usage_map[item_name] = fu = defaultdict(dict)
    bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
    keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
    for font in font_usage:
        text = set()
        for t in font['text']:
            text |= frozenset(t)
        text.difference_update(exclude)
        if not text:
            continue
        normalize_font_properties(font)
        for rule in get_matching_rules(rules, font):
            self.font_stats[rule['src']] |= text
        if self.do_embed:
            ff = [icu_lower(x) for x in font.get('font-family', [])]
            if ff and ff[0] not in bad_fonts:
                # .items() rather than .iteritems() so this runs on
                # Python 2 and 3
                key = frozenset(
                    (k, ff[0] if k == 'font-family' else v)
                    for k, v in font.items() if k in keys)
                val = fu[key]
                if not val:
                    val.update({k: (font[k][0] if k == 'font-family' else font[k]) for k in keys})
                    val['text'] = set()
                val['text'] |= text
    # Store a plain dict, so later lookups do not create entries
    self.font_usage_map[item_name] = dict(fu)

    if self.do_embed:
        self.page.evaljs('window.font_stats.get_font_families()')
        font_families = self.page.bridge_value
        if not isinstance(font_families, dict):
            raise Exception('Unknown error occurred while reading font families')
        self.font_spec_map[item_name] = fs = set()
        for font_dict, text, pseudo in pseudo_element_font_usage:
            font_families[font_dict['font-family']] = True
        for raw in font_families:
            for x in parse_font_families(self.parser, raw):
                if x.lower() not in bad_fonts:
                    fs.add(x)
def collect_font_stats(self):
    """Record the @font-face rules and font usage of the current page.

    The data is read from the page through ``window.font_stats`` JavaScript
    calls and stored on ``self``, keyed by the container name of
    ``self.current_item``:

    * ``font_rule_map``: the valid @font-face rules of the page
    * ``all_font_rules`` / ``font_stats``: keyed by font file name; the
      latter accumulates the set of characters rendered with each font,
      including the characters produced by ``text-transform`` and small-caps
      style ``font-variant`` values
    * ``font_usage_map``: per font specification usage, filled only when
      ``self.do_embed`` is set
    * ``font_spec_map``: font family names used, only when ``self.do_embed``
      is set

    Returns ``None``. Returns early, storing nothing, when the page has no
    valid @font-face rules and ``self.do_embed`` is false.

    Raises ``Exception`` when a value read back from the page does not have
    the expected type.
    """
    self.page.evaljs('window.font_stats.get_font_face_rules()')
    font_face_rules = self.page.bridge_value
    if not isinstance(font_face_rules, list):
        raise Exception('Unknown error occurred while reading font-face rules')

    # Weed out invalid font-face rules
    rules = []
    import tinycss
    parser = tinycss.make_full_parser()
    for rule in font_face_rules:
        ff = rule.get('font-family', None)
        if not ff:
            continue
        style = self.parser.parseStyle('font-family:%s'%ff, validate=False)
        ff = [x.value for x in style.getProperty('font-family').propertyValue]
        if not ff or ff[0] == 'inherit':
            continue
        rule['font-family'] = frozenset(icu_lower(f) for f in ff)
        src = rule.get('src', None)
        if not src:
            continue
        try:
            tokens = parser.parse_stylesheet('@font-face { src: %s }' % src).rules[0].declarations[0].value
        except Exception:
            self.log.warn('Failed to parse @font-family src: %s' % src)
            continue
        # Use the first URI in src that resolves to a file in the book
        for token in tokens:
            if token.type == 'URI':
                uv = token.value
                if uv:
                    sn = self.href_to_name(uv, '@font-face rule')
                    if sn is not None:
                        rule['src'] = sn
                        break
        else:
            self.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % src)
            continue
        normalize_font_properties(rule)
        rule['width'] = widths[rule['font-stretch']]
        rule['weight'] = int(rule['font-weight'])
        rules.append(rule)

    if not rules and not self.do_embed:
        return

    # Computed once; assumes abspath_to_name() is a pure lookup
    item_name = self.container.abspath_to_name(self.current_item)
    self.font_rule_map[item_name] = rules
    for rule in rules:
        src = rule['src']
        self.all_font_rules[src] = rule
        if src not in self.font_stats:
            self.font_stats[src] = set()

    self.page.evaljs('window.font_stats.get_font_usage()')
    font_usage = self.page.bridge_value
    if not isinstance(font_usage, list):
        raise Exception('Unknown error occurred while reading font usage')
    self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()')
    pseudo_element_font_usage = self.page.bridge_value
    if not isinstance(pseudo_element_font_usage, list):
        raise Exception('Unknown error occurred while reading pseudo element font usage')
    font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser)
    exclude = {'\n', '\r', '\t'}
    self.font_usage_map[item_name] = fu = defaultdict(dict)
    bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
    keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
    caps_variants = {'smallcaps', 'small-caps', 'all-small-caps', 'petite-caps', 'all-petite-caps', 'unicase'}
    for font in font_usage:
        text = set()
        for t in font['text']:
            # Add the characters the renderer will actually draw once
            # text-transform has been applied
            tt = (font['text-transform'] or '').lower()
            if tt != 'none':
                if tt == 'uppercase':
                    t = icu_upper(t)
                elif tt == 'lowercase':
                    t = icu_lower(t)
                elif tt == 'capitalize':
                    m = self.capitalize_pat.search(t)
                    if m is not None:
                        t += icu_upper(m.group())
            fv = (font['font-variant'] or '').lower()
            if fv in caps_variants:
                t += icu_upper(t)  # for renderers that try to fake small-caps by using small normal caps
            text |= frozenset(t)
        text.difference_update(exclude)
        if not text:
            continue
        normalize_font_properties(font)
        for rule in get_matching_rules(rules, font):
            self.font_stats[rule['src']] |= text
        if self.do_embed:
            ff = [icu_lower(x) for x in font.get('font-family', [])]
            if ff and ff[0] not in bad_fonts:
                # .items() rather than .iteritems() so this runs on
                # Python 2 and 3
                key = frozenset(
                    (k, ff[0] if k == 'font-family' else v)
                    for k, v in font.items() if k in keys)
                val = fu[key]
                if not val:
                    val.update({k: (font[k][0] if k == 'font-family' else font[k]) for k in keys})
                    val['text'] = set()
                val['text'] |= text
    # Store a plain dict, so later lookups do not create entries
    self.font_usage_map[item_name] = dict(fu)

    if self.do_embed:
        self.page.evaljs('window.font_stats.get_font_families()')
        font_families = self.page.bridge_value
        if not isinstance(font_families, dict):
            raise Exception('Unknown error occurred while reading font families')
        self.font_spec_map[item_name] = fs = set()
        for font_dict, text, pseudo in pseudo_element_font_usage:
            font_families[font_dict['font-family']] = True
        for raw in font_families:
            for x in parse_font_families(self.parser, raw):
                if x.lower() not in bad_fonts:
                    fs.add(x)
def css_data(container, book_locale):
    """Collect usage data for the CSS rules in *container*.

    Stylesheets (files whose mime type is in ``OEB_STYLES``) and the
    ``<style>`` elements of spine documents are parsed with tinycss. Every
    selector is then compiled with ``build_selector`` and run against every
    spine document that links to or embeds the sheet it came from.

    :param container: The book container to inspect.
    :param book_locale: Not used by this function.
    :return: A list of ``CSSEntry`` objects, one per CSS rule.
    """
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten parsed rules into CSSRule objects and the names of
        # @imported sheets. Rules nested in rules that have a ``rules``
        # attribute are expanded in place.
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(CSSRule(
                    selector,
                    RuleLocation(file_name, sourceline + rule.line, rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')
    # .items() rather than .iteritems() so this runs on Python 2 and 3
    for name, mt in container.mime_map.items():
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(
                name, parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    html_sheets[name].append(css_rules(
                        name,
                        parser.parse_stylesheet(force_unicode(style.text, 'utf-8')).rules,
                        style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))
    pseudo_pat = re.compile(PSEUDO_PAT, re.I)
    # Compiled selectors, keyed by (case_sensitive, selector)
    cache = {}

    def rules_in_sheet(sheet, active=()):
        # ``active`` holds the names of the imported sheets currently being
        # expanded, so that circular @import chains terminate instead of
        # recursing without bound.
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            elif rule not in active:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    for irule in rules_in_sheet(isheet, active + (rule,)):
                        yield irule

    def sheets_for_html(name, root):
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    def tag_text(elem):
        tag = elem.tag.rpartition('}')[-1]
        if elem.attrib:
            attribs = ' '.join(
                '%s="%s"' % (k, prepare_string_for_xml(elem.get(k, ''), True))
                for k in elem.keys())
            return '<%s %s>' % (tag, attribs)
        return '<%s>' % tag

    def matches_for_selector(selector, root):
        selector = pseudo_pat.sub('', selector)
        selector = MIN_SPACE_RE.sub(r'\1', selector)
        try:
            xp = cache[(True, selector)]
        except KeyError:
            xp = cache[(True, selector)] = build_selector(selector)

        try:
            matches = xp(root)
        except Exception:
            return ()
        if not matches:
            # Nothing matched case sensitively, retry ignoring case
            try:
                xp = cache[(False, selector)]
            except KeyError:
                xp = cache[(False, selector)] = build_selector(selector, case_sensitive=False)
            try:
                matches = xp(root)
            except Exception:
                return ()
        return (MatchLocation(tag_text(elem), elem.sourceline) for elem in matches)

    for name, inline_sheets in html_sheets.items():
        root = container.parsed(name)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(matches_for_selector(rule.selector, root))

    ans = []
    for rule, loc_map in rule_map.items():
        la = tuple(
            CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
            for name, locations in loc_map.items() if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans