def _process_nbconvert_css(self, css):
    """Trim and namespace the downloaded IPython notebook CSS.

    Everything before the first occurrence of
    ``IPYTHON_NOTEBOOK_DECLARE_STR`` is discarded, the highlight class is
    optionally renamed, and every qualified rule gets
    ``.relate-notebook-container `` prepended to its prelude.

    :param css: the raw stylesheet as ``bytes``.
    :returns: the processed stylesheet as ``bytes``, encoded with the
        encoding tinycss2 detected for the input.
    :raises ValueError: if the marker string does not occur in *css*.
    """
    print("Processing downloaded ipython notebook CSS.")

    marker = IPYTHON_NOTEBOOK_DECLARE_STR.encode()
    # Split on the FIRST marker only: indexing ``css.split(marker)[1]``
    # would silently drop everything after a second occurrence.
    _, found, remainder = css.partition(marker)
    if not found:
        raise ValueError("Bad splitter for notebook css %s"
                         % IPYTHON_NOTEBOOK_DECLARE_STR)
    css = marker + remainder

    print("Done.")

    if REPLACE_HIGHLIGHT_WITH_CODEHILITE:
        css = css.replace(HIGHLIGHT_CSS_CLASS.encode() + b" ",
                          CODEHILITE_CSS_CLASS.encode() + b" ")

    import tinycss2
    css_parsed, encoding = tinycss2.parse_stylesheet_bytes(css)
    for n in css_parsed:
        if isinstance(n, tinycss2.ast.QualifiedRule):
            # NOTE(review): tokens are inserted at the very start of the
            # prelude only, so in a comma-separated selector list just the
            # first selector is prefixed -- confirm this is intended.
            n.prelude[0:0] = [
                tinycss2.ast.LiteralToken(None, None, "."),
                tinycss2.ast.IdentToken(
                    None, None, "relate-notebook-container"),
                tinycss2.ast.WhitespaceToken(None, None, " "),
            ]

    return tinycss2.serialize(css_parsed).encode(encoding.name)
def preprocess_descriptors(base_url, descriptors):
    """Validate descriptors and yield the supported ones.

    Nodes that are not declarations, or that are marked important, are
    skipped silently. A declaration whose name is not in ``DESCRIPTORS``
    or whose validator rejects the value is logged as a warning and
    skipped.

    Yield ``(name, value)`` tuples, with dashes in the name replaced by
    underscores.

    """
    for node in descriptors:
        is_candidate = node.type == 'declaration' and not node.important
        if not is_candidate:
            continue

        tokens = remove_whitespace(node.value)
        try:
            if node.name not in DESCRIPTORS:
                raise InvalidValues('descriptor not supported')
            validator = DESCRIPTORS[node.name]
            value = (
                validator(tokens, base_url) if validator.wants_base_url
                else validator(tokens))
            if value is None:
                raise InvalidValues
        except InvalidValues as exc:
            reason = (
                exc.args[0] if exc.args and exc.args[0] else 'invalid value')
            LOGGER.warning(
                'Ignored `%s:%s` at %i:%i, %s.',
                node.name, tinycss2.serialize(node.value),
                node.source_line, node.source_column, reason)
            continue

        yield node.name.replace('-', '_'), value
def parse_media_query(tokens):
    """Parse a comma-separated list of media types.

    Return ``['all']`` when no tokens remain after whitespace removal, the
    list of lower-cased media types when every comma-separated part is a
    single identifier, and ``None`` (after logging a warning) otherwise.

    """
    tokens = remove_whitespace(tokens)
    if not tokens:
        return ['all']

    media = []
    for part in split_on_comma(tokens):
        if [token.type for token in part] != ['ident']:
            LOGGER.warning(
                'Expected a media type, got %s', tinycss2.serialize(part))
            return None
        media.append(part[0].lower_value)
    return media
def parse_declarations(input):
    """Split a declaration list into normal and ``!important`` entries.

    Only nodes of type ``declaration`` whose name does not start with ``-``
    are kept. Each entry is a ``(lower_name, value)`` tuple, where the value
    is the stripped serialization of the declaration's tokens.

    Return ``(normal_declarations, important_declarations)``.
    """
    normal, important = [], []
    for node in tinycss2.parse_declaration_list(input):
        # TODO: warn on error
        # if node.type == 'error':
        if node.type != 'declaration' or node.name.startswith('-'):
            continue
        # Serializing perfectly good tokens just to re-parse them later :(
        text = tinycss2.serialize(node.value).strip()
        bucket = important if node.important else normal
        bucket.append((node.lower_name, text))
    return normal, important
def parse_and_validate(self, stylesheet_source):
    """Parse a stylesheet, validate it, and return its minified form.

    :param stylesheet_source: the stylesheet text.
    :returns: a ``(serialized, errors)`` pair. ``serialized`` is the
        minified stylesheet encoded as UTF-8 bytes, or empty bytes when
        any error was found. ``errors`` is the list of validation errors
        sorted by line, each with ``_source_lines`` set to the split
        source.
    """
    if len(stylesheet_source) > (MAX_SIZE_KIB * 1024):
        # Return bytes here as well, so every path yields the same type as
        # the encoded result returned below.
        return b"", [ValidationError(0, "TOO_BIG", {"size": MAX_SIZE_KIB})]

    nodes = tinycss2.parse_stylesheet(stylesheet_source)

    source_lines = stylesheet_source.splitlines()

    backslash_errors = self.check_for_evil_codepoints(source_lines)
    validation_errors = self.validate_rule_list(nodes)

    errors = []
    for error in itertools.chain(backslash_errors, validation_errors):
        # Attach the source lines to each error object.
        error._source_lines = source_lines
        errors.append(error)
    errors.sort(key=lambda e: e.line)

    if errors:
        return b"", errors

    serialized = rcssmin.cssmin(tinycss2.serialize(nodes))
    return serialized.encode("utf-8"), errors
def validation_error(level, reason):
    """Log that the current ``declaration`` is ignored.

    ``level`` is the name of the ``LOGGER`` method to call (for example
    ``'warning'``); ``reason`` is appended to the message. ``declaration``
    and ``serialize`` are read from the enclosing scope.
    """
    log = getattr(LOGGER, level)
    log(
        'Ignored `%s:%s` at %i:%i, %s.',
        declaration.name,
        serialize(declaration.value),
        declaration.source_line,
        declaration.source_column,
        reason,
    )
def preprocess_declarations(base_url, declarations):
    """Validate declarations and expand them to long-hand properties.

    Error nodes are logged as warnings. Declarations that are not
    applicable, carry an unsupported prefix, or fail validation are logged
    and skipped.

    Yield ``(name, value, important)`` tuples, with dashes in the name
    replaced by underscores.

    """
    def report_ignored(declaration, level, reason):
        # ``level`` names the LOGGER method to use ('debug', 'warning', ...).
        getattr(LOGGER, level)(
            'Ignored `%s:%s` at %i:%i, %s.',
            declaration.name, serialize(declaration.value),
            declaration.source_line, declaration.source_column, reason)

    for declaration in declarations:
        if declaration.type == 'error':
            LOGGER.warning(
                'Error: %s at %i:%i.',
                declaration.message,
                declaration.source_line, declaration.source_column)

        if declaration.type != 'declaration':
            continue

        name = declaration.lower_name

        if name in NOT_PRINT_MEDIA:
            report_ignored(
                declaration, 'debug',
                'the property does not apply for the print media')
            continue

        if name.startswith(PREFIX):
            stripped = name[len(PREFIX):]
            if stripped not in PROPRIETARY:
                if stripped not in UNSTABLE:
                    LOGGER.warning(
                        'Ignored `%s:%s` at %i:%i, '
                        'prefix on this attribute is not supported, '
                        'use `%s` instead.',
                        declaration.name, serialize(declaration.value),
                        declaration.source_line, declaration.source_column,
                        stripped)
                    continue
                LOGGER.warning(
                    'Deprecated `%s:%s` at %i:%i, '
                    'prefixes on unstable attributes are deprecated, '
                    'use `%s` instead.',
                    declaration.name, serialize(declaration.value),
                    declaration.source_line, declaration.source_column,
                    stripped)
            name = stripped

        if name.startswith('-'):
            report_ignored(
                declaration, 'debug', 'prefixed selectors are ignored')
            continue

        expander = EXPANDERS.get(name, validate_non_shorthand)
        tokens = remove_whitespace(declaration.value)
        try:
            # Consume the generator eagerly so that validation errors are
            # raised (and caught) here.
            expanded = list(expander(base_url, name, tokens))
        except InvalidValues as exc:
            has_reason = exc.args and exc.args[0]
            report_ignored(
                declaration, 'warning',
                exc.args[0] if has_reason else 'invalid value')
            continue

        important = declaration.important
        for long_name, value in expanded:
            yield long_name.replace('-', '_'), value, important
def __build_data_structures(self, multiprop):
    """Build the property and rule tables from the parsed stylesheet.

    Call after the stylesheet has been parsed. Only the rules between
    ``self.first_rule_idx`` and ``self.last_rule_idx`` are considered;
    rules that are not qualified rules are serialized, appended to
    ``self.ignored_rules`` and otherwise skipped.

    :param multiprop: As in __init__. When true, declarations of one rule
        that share the same (name, importance) are merged into a single
        declaration whose value joins the individual values with
        ``multiprop_separator``.
    :returns: (props, rules)
        props is dict
            from prop_name (string)
              -> (!important (bool), specificity (a, b, c))
              -> dict from (selector (parsed_tree),
                            value (normalised string with !important))
              -> pair of:
                     line_no (int, not precise line_no but keeps order,
                              and is last occurrence of sel, v in file)
                     boolean -- True if rule appears only once in file
        rules is a list of CSSRule(S, P) where S is a set of selectors and
              P is a list of pairs (p, v) of strings property and value
    """

    def get_decl_value(decl):
        """Returns decl.value normalised and with !important appended if
        needed"""
        val = tinycss2.serialize(decl.value)
        normed = self.__normalise_css_value(val)
        important = "!important" if decl.important else ""
        return normed + important

    props = defaultdict(lambda: defaultdict(dict))
    rules = list()

    # Running counter used as a pseudo line number; it only preserves the
    # relative order of declarations.
    line_no = 1

    for rule in islice(self.stylesheet,
                       self.first_rule_idx,
                       self.last_rule_idx):
        sels = set()
        declarations = None

        if rule.type != "qualified-rule":
            # Remember the rule text verbatim and move on.
            rule_str = tinycss2.serialize([rule])
            self.ignored_rules.append(rule_str)
            print "WARNING: merely copying rule ", rule_str
            continue

        parsed_rule_decls = tinycss2.parse_declaration_list(
            rule.content, skip_whitespace=True, skip_comments=True)

        # Keep only the declarations that parsed without error.
        rule_declarations = []
        for i, decl in enumerate(parsed_rule_decls):
            if decl.type == "error":
                print "WARNING: parser error in", i + 1, "th declaration of"
                print tinycss2.serialize(rule.content)
                print decl.message
                print "Ignoring declaration"
                continue
            rule_declarations.append(decl)

        if not multiprop:
            declarations = rule_declarations
        else:
            declarations = []
            # keep !important and not important separate, to be on the safe
            # side
            combined = defaultdict(list)
            for decl in rule_declarations:
                val = self.__normalise_css_value(
                    tinycss2.serialize(decl.value))
                combined[(decl.lower_name, decl.important)].append(val)

            def build_val(vals):
                return multiprop_separator.join(vals)

            for ((name, priority), vals) in combined.iteritems():
                decl = FakeDeclaration(name, priority, build_val(vals))
                declarations.append(decl)

        # NOTE(review): the (property, value) pairs stored on the rule are
        # built from rule_declarations, i.e. the un-merged declarations,
        # even when multiprop is set -- confirm this is intended.
        decls = [(decl.lower_name, get_decl_value(decl))
                 for decl in rule_declarations]

        # reset line number to beginning of rule after each selector
        line_no_start = line_no
        for sel in self.__parse_selector(rule.prelude):
            sels.add(sel)
            line_no = line_no_start
            for decl in declarations:
                v = get_decl_value(decl)
                specificity = (decl.important,
                               sel.parsed_tree.specificity())
                tup = (sel, v)
                m = props[decl.lower_name][specificity]
                # The flag is True only the first time (sel, v) is seen;
                # a later occurrence overwrites the entry with False and
                # the newer line number.
                m[tup] = (line_no, not tup in m)
                line_no += 1

        rules.append(CSSRule(sels, decls))

    return (props, rules)
def __str__(self):
    """Return the concatenated serialization of every stylesheet entry."""
    return ''.join(
        [tinycss2.serialize(rule) for rule in self.stylesheet])
import tinycss2 # Parse CSS and add rules to the matcher matcher = cssselect2.Matcher() rules = tinycss2.parse_stylesheet(''' body { font-size: 2em } body p { background: red } p { color: blue } ''', skip_whitespace=True) for rule in rules: selectors = cssselect2.compile_selector_list(rule.prelude) selector_string = tinycss2.serialize(rule.prelude) content_string = tinycss2.serialize(rule.content) payload = (selector_string, content_string) for selector in selectors: matcher.add_selector(selector, payload) # Parse HTML and find CSS rules applying to each tag html_tree = ElementTree.fromstring(''' <html> <body> <p>Test</p> </body> </html> ''')
def fixCss(self, soup):
    '''
    Patch the inline CSS of every element in `soup` that has a `style`
    attribute, and return `soup`.

    Because the color scheme of our interface can vary from the original,
    we need to fix any cases of white text. However, we want to preserve
    *most* of the color information, so only the declarations that need
    it are patched:

     - `font` / `font-family` are replaced by `Sans-Serif`.
     - sizing, `display` and `background-image` declarations are dropped.
     - bright foreground colors (mean channel > 150) are inverted.
     - dark background colors (mean channel < 100) have every channel
       capped at 100.
     - `overflow` is forced to `visible`.

    A style that cannot be processed is replaced by an empty string.
    '''

    # Matches 6- or 3-digit hex colors (without the leading '#').
    hexr = re.compile('((?:[a-fA-F0-9]{6})|(?:[a-fA-F0-9]{3}))')

    def clamp_hash_token(intok, high):
        # Rewrite, in place, the hex color(s) held by a hash token.
        for match in hexr.findall(intok.value):
            color = webcolors.hex_to_rgb("#" + match)
            mean = sum(color) / len(color)
            if high:
                if mean <= 150:
                    continue
                color = tuple(max(255 - cval, 0) for cval in color)
            else:
                if mean >= 100:
                    continue
                color = tuple(min(cval, 100) for cval in color)
            # A hash token's value does not include the leading '#', so it
            # must be stripped in BOTH branches; keeping it would serialize
            # as an escaped, invalid color.
            new = webcolors.rgb_to_hex(color).replace("#", "")
            intok.value = intok.value.replace(match, new)
        return intok

    def clamp_css_color(toks, high=True):
        # Drop whitespace tokens, clamp hash colors, blank out strings.
        toks = [tok for tok in toks if tok.type != 'whitespace']
        for tok in toks:
            if tok.type == 'hash':
                clamp_hash_token(tok, high)
            if tok.type == 'string':
                tok.value = ""
        return toks

    initial_keys = ('font', 'font-family')
    empty_keys = (
        'width',
        'height',
        'display',
        'max-width',
        'max-height',
        'background-image',
    )
    foreground_color_keys = ('color',)
    background_color_keys = ('background', 'background-color')

    for item in soup.find_all(True, attrs={"style": True}):
        if not item['style']:
            continue
        try:
            parsed_style = tinycss2.parse_declaration_list(item['style'])
            for style_chunk in parsed_style:
                if style_chunk.type != 'declaration':
                    continue

                # CSS property names are case-insensitive, so compare the
                # lower-cased name.
                prop = style_chunk.lower_name

                if prop in initial_keys:
                    style_chunk.value = [
                        tinycss2.ast.IdentToken(1, 1, "Sans-Serif")
                    ]
                if prop in empty_keys:
                    style_chunk.value = []
                if prop in foreground_color_keys:
                    style_chunk.value = clamp_css_color(style_chunk.value)
                if prop in background_color_keys:
                    style_chunk.value = clamp_css_color(
                        style_chunk.value, high=False)

                # Force overflow to be visible
                if prop == "overflow":
                    style_chunk.value = [
                        tinycss2.ast.IdentToken(1, 1, "visible")
                    ]

            # Declarations emptied above are removed entirely.
            parsed_style = [chunk for chunk in parsed_style if chunk.value]
            item['style'] = tinycss2.serialize(parsed_style)

        except AttributeError:
            # If the parser encountered an error, it'll produce 'ParseError'
            # tokens without the 'value' attribute. This produces attribute
            # errors. If the style is that broken, just clobber it.
            item['style'] = ""

    return soup
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, matcher, page_rules, fonts,
                          font_config, ignore_imports=False):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Walk ``stylesheet_rules`` and dispatch on the rule kind:

    - qualified rules: their selectors are added to ``matcher`` together
      with the validated declarations;
    - ``@import``: the referenced stylesheet is loaded through ``CSS``
      unless ``ignore_imports`` is already set;
    - ``@media``: the nested rules are processed recursively when the
      media query matches ``device_media_type``;
    - ``@page``: entries are appended to ``page_rules`` for the page
      itself and for each nested at-rule that has content;
    - ``@font-face``: the font is registered with ``font_config`` and the
      resulting file name appended to ``fonts``.

    ``ignore_imports`` becomes true as soon as one of the other rule kinds
    has been accepted; later ``@import`` rules are then warned about and
    skipped.

    """
    for rule in stylesheet_rules:
        # Skip rules without a content block, except @import rules, which
        # are still processed below.
        if getattr(rule, 'content', None) is None and (
                rule.type != 'at-rule' or rule.lower_at_keyword != 'import'):
            continue

        if rule.type == 'qualified-rule':
            declarations = list(
                preprocess_declarations(
                    base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                try:
                    selectors = cssselect2.compile_selector_list(rule.prelude)
                    for selector in selectors:
                        # The selector is registered before its
                        # pseudo-element is checked, so selectors handled
                        # before the failing one stay in the matcher.
                        matcher.add_selector(selector, declarations)
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect2.SelectorError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                    ignore_imports = True
                except cssselect2.SelectorError as exc:
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   tinycss2.serialize(rule.prelude), exc)
                    continue
            else:
                # A rule without any valid declaration still ends the
                # section where @import is accepted.
                ignore_imports = True

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'import':
            if ignore_imports:
                LOGGER.warning(
                    '@import rule "%s" not at the beginning of the '
                    'the whole rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue

            # The first token must be the URL; the rest is the media query.
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @import rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            url = url_join(base_url,
                           url,
                           allow_relative=False,
                           context='@import at %s:%s',
                           context_args=(rule.source_line,
                                         rule.source_column))
            if url is not None:
                try:
                    # The object is created for its side effects only; it
                    # receives the same matcher and page_rules as this
                    # call (presumably filling them -- confirm in CSS).
                    CSS(url=url,
                        url_fetcher=url_fetcher,
                        media_type=device_media_type,
                        font_config=font_config,
                        matcher=matcher,
                        page_rules=page_rules)
                except URLFetchingError as exc:
                    LOGGER.error('Failed to load stylesheet at %s : %s', url,
                                 exc)

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'media':
            media = parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @media rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            ignore_imports = True
            if not evaluate_media_query(media, device_media_type):
                continue
            # Recurse into the nested rules; imports are never accepted
            # inside @media.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(device_media_type,
                                  base_url,
                                  content_rules,
                                  url_fetcher,
                                  matcher,
                                  page_rules,
                                  fonts,
                                  font_config,
                                  ignore_imports=True)

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'page':
            data = parse_page_selectors(rule)

            if data is None:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue

            ignore_imports = True
            for page_type in data:
                # 'specificity' is removed from the dict so that the
                # remaining keys can be passed to PageType.
                specificity = page_type.pop('specificity')
                page_type = PageType(**page_type)
                # Use a double lambda to have a closure that holds page_types
                match = (lambda page_type: lambda page_names: list(
                    matching_page_types(page_type, names=page_names))
                         )(page_type)
                content = tinycss2.parse_declaration_list(rule.content)
                declarations = list(preprocess_declarations(base_url, content))

                if declarations:
                    selector_list = [(specificity, None, match)]
                    page_rules.append((rule, selector_list, declarations))

                # Nested at-rules with content get their own entry, keyed
                # by '@' + their lower-cased keyword.
                for margin_rule in content:
                    if margin_rule.type != 'at-rule' or (margin_rule.content
                                                         is None):
                        continue
                    declarations = list(
                        preprocess_declarations(
                            base_url,
                            tinycss2.parse_declaration_list(
                                margin_rule.content)))
                    if declarations:
                        selector_list = [
                            (specificity, '@' + margin_rule.lower_at_keyword,
                             match)
                        ]
                        page_rules.append(
                            (margin_rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'font-face':
            ignore_imports = True
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            # Both descriptors are required; the for/else only registers
            # the font when neither is missing.
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'), rule.source_line,
                        rule.source_column)
                    break
            else:
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
def preprocess_declarations(base_url, declarations):
    """Validate declarations and expand them to long-hand properties.

    Error nodes are logged as warnings. Declarations that do not apply,
    carry an unsupported prefix, or fail validation are logged as warnings
    and skipped.

    Yield ``(name, value, important)`` tuples, with dashes in the name
    replaced by underscores.

    """
    ignored_format = 'Ignored `%s:%s` at %i:%i, %s.'

    for declaration in declarations:
        kind = declaration.type
        if kind == 'error':
            LOGGER.warning(
                'Error: %s at %i:%i.',
                declaration.message,
                declaration.source_line, declaration.source_column)
        if kind != 'declaration':
            continue

        def validation_error(level, reason):
            # Only ever called during the current iteration, so reading
            # ``declaration`` from the enclosing scope is safe.
            emit = getattr(LOGGER, level)
            emit(
                ignored_format,
                declaration.name, serialize(declaration.value),
                declaration.source_line, declaration.source_column, reason)

        name = declaration.lower_name

        if name in NOT_PRINT_MEDIA:
            validation_error(
                'warning', 'the property does not apply for the print media')
            continue

        if name.startswith(PREFIX):
            unprefixed_name = name[len(PREFIX):]
            is_proprietary = unprefixed_name in PROPRIETARY
            if not is_proprietary and unprefixed_name not in UNSTABLE:
                LOGGER.warning(
                    'Ignored `%s:%s` at %i:%i, '
                    'prefix on this attribute is not supported, '
                    'use `%s` instead.',
                    declaration.name, serialize(declaration.value),
                    declaration.source_line, declaration.source_column,
                    unprefixed_name)
                continue
            if not is_proprietary:
                LOGGER.warning(
                    'Deprecated `%s:%s` at %i:%i, '
                    'prefixes on unstable attributes are deprecated, '
                    'use `%s` instead.',
                    declaration.name, serialize(declaration.value),
                    declaration.source_line, declaration.source_column,
                    unprefixed_name)
            name = unprefixed_name

        expander_ = EXPANDERS.get(name, validate_non_shorthand)
        tokens = remove_whitespace(declaration.value)
        try:
            # Materialise the generator now so that errors surface here.
            result = list(expander_(base_url, name, tokens))
        except InvalidValues as exc:
            if exc.args and exc.args[0]:
                reason = exc.args[0]
            else:
                reason = 'invalid value'
            validation_error('warning', reason)
            continue

        important = declaration.important
        for long_name, value in result:
            yield long_name.replace('-', '_'), value, important
def fixCss(self, soup):
    '''
    Patch the inline CSS of every element in `soup` that has a `style`
    attribute, and return `soup`.

    Because the color scheme of our interface can vary from the original,
    we need to fix any cases of white text. However, we want to preserve
    *most* of the color information, so we look at all the inline CSS and
    just patch where needed:

     - `font` / `font-family` become `Sans-Serif`.
     - `width`, `height`, `display`, `max-width`, `max-height` and
       `background-image` declarations are removed.
     - foreground colors with a mean channel value above 150 are inverted.
     - background colors with a mean channel value below 100 have each
       channel capped at 100.
     - `overflow` is forced to `visible`.

    If a style cannot be processed it is replaced by an empty string.
    '''

    # Matches 6- or 3-digit hex colors (without the leading '#').
    hexr = re.compile('((?:[a-fA-F0-9]{6})|(?:[a-fA-F0-9]{3}))')

    def to_bare_hex(color):
        # tinycss2 hash tokens store their value WITHOUT the leading '#',
        # so the '#' produced by webcolors must always be removed.
        return webcolors.rgb_to_hex(color).replace("#", "")

    def clamp_hash_token(intok, high):
        # Rewrite, in place, the hex color(s) held by a hash token.
        old = hexr.findall(intok.value)
        for match in old:
            color = webcolors.hex_to_rgb("#"+match)
            mean = sum(color)/len(color)

            if high and mean > 150:
                inverted = tuple(max(255-cval, 0) for cval in color)
                intok.value = intok.value.replace(match, to_bare_hex(inverted))
            elif not high and mean < 100:
                capped = tuple(min(cval, 100) for cval in color)
                intok.value = intok.value.replace(match, to_bare_hex(capped))
        return intok

    def clamp_css_color(toks, high=True):
        # Drop whitespace tokens, clamp hash colors, blank out strings.
        toks = [tok for tok in toks if tok.type != 'whitespace']

        for tok in toks:
            if tok.type == 'hash':
                clamp_hash_token(tok, high)
            if tok.type == 'string':
                tok.value = ""
        return toks

    hascss = soup.find_all(True, attrs={"style" : True})

    initial_keys = ('font', 'font-family')
    empty_keys = (
        'width',
        'height',
        'display',
        'max-width',
        'max-height',
        'background-image',
    )
    foreground_color_keys = ('color',)
    background_color_keys = ('background', 'background-color')

    for item in hascss:
        if not item['style']:
            continue

        try:
            parsed_style = tinycss2.parse_declaration_list(item['style'])

            for style_chunk in parsed_style:
                if style_chunk.type != 'declaration':
                    continue

                # Property names are case-insensitive in CSS: match on the
                # lower-cased name.
                prop = style_chunk.lower_name

                if prop in initial_keys:
                    style_chunk.value = [tinycss2.ast.IdentToken(1, 1, "Sans-Serif")]
                if prop in empty_keys:
                    style_chunk.value = []
                if prop in foreground_color_keys:
                    style_chunk.value = clamp_css_color(style_chunk.value)
                if prop in background_color_keys:
                    style_chunk.value = clamp_css_color(style_chunk.value, high=False)

                # Force overflow to be visible
                if prop == "overflow":
                    style_chunk.value = [tinycss2.ast.IdentToken(1, 1, "visible")]

            # Declarations emptied above are removed entirely.
            parsed_style = [chunk for chunk in parsed_style if chunk.value]

            item['style'] = tinycss2.serialize(parsed_style)

        except AttributeError:
            # If the parser encountered an error, it'll produce 'ParseError'
            # tokens without the 'value' attribute. This produces attribute
            # errors. If the style is that broken, just clobber it.
            item['style'] = ""

    return soup
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, matcher, page_rules, fonts,
                          font_config, ignore_imports=False):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Walk ``stylesheet_rules`` and dispatch on the rule kind:

    - qualified rules: their selectors are added to ``matcher`` together
      with the validated declarations;
    - ``@import``: the referenced stylesheet is loaded through ``CSS``
      unless ``ignore_imports`` is already set;
    - ``@media``: the nested rules are processed recursively when the
      media query matches ``device_media_type``;
    - ``@page``: entries are appended to ``page_rules`` for the page
      itself and for each nested at-rule that has content;
    - ``@font-face``: the font is registered with ``font_config`` and the
      resulting file name appended to ``fonts``.

    ``ignore_imports`` becomes true as soon as one of the other rule kinds
    has been accepted; later ``@import`` rules are then warned about and
    skipped.

    """
    for rule in stylesheet_rules:
        # Skip rules without a content block, except @import rules, which
        # are still processed below.
        if getattr(rule, 'content', None) is None and (
                rule.type != 'at-rule' or rule.lower_at_keyword != 'import'):
            continue

        if rule.type == 'qualified-rule':
            declarations = list(preprocess_declarations(
                base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                # Selector problems are logged at WARNING, except prefixed
                # pseudo-elements, which are downgraded to DEBUG below.
                logger_level = WARNING
                try:
                    selectors = cssselect2.compile_selector_list(rule.prelude)
                    for selector in selectors:
                        # The selector is registered before its
                        # pseudo-element is checked, so selectors handled
                        # before the failing one stay in the matcher.
                        matcher.add_selector(selector, declarations)
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            if selector.pseudo_element.startswith('-'):
                                logger_level = DEBUG
                                raise cssselect2.SelectorError(
                                    'ignored prefixed pseudo-element: %s'
                                    % selector.pseudo_element)
                            else:
                                raise cssselect2.SelectorError(
                                    'unknown pseudo-element: %s'
                                    % selector.pseudo_element)
                    ignore_imports = True
                except cssselect2.SelectorError as exc:
                    LOGGER.log(
                        logger_level,
                        "Invalid or unsupported selector '%s', %s",
                        tinycss2.serialize(rule.prelude), exc)
                    continue
            else:
                # A rule without any valid declaration still ends the
                # section where @import is accepted.
                ignore_imports = True

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'import':
            if ignore_imports:
                LOGGER.warning('@import rule "%s" not at the beginning of the '
                               'the whole rule was ignored at %s:%s.',
                               tinycss2.serialize(rule.prelude),
                               rule.source_line, rule.source_column)
                continue

            # The first token must be the URL; the rest is the media query.
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = media_queries.parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning('Invalid media type "%s" '
                               'the whole @import rule was ignored at %s:%s.',
                               tinycss2.serialize(rule.prelude),
                               rule.source_line, rule.source_column)
                continue
            if not media_queries.evaluate_media_query(
                    media, device_media_type):
                continue
            url = url_join(
                base_url, url, allow_relative=False,
                context='@import at %s:%s',
                context_args=(rule.source_line, rule.source_column))
            if url is not None:
                try:
                    # The object is created for its side effects only; it
                    # receives the same matcher and page_rules as this
                    # call (presumably filling them -- confirm in CSS).
                    CSS(
                        url=url, url_fetcher=url_fetcher,
                        media_type=device_media_type, font_config=font_config,
                        matcher=matcher, page_rules=page_rules)
                except URLFetchingError as exc:
                    LOGGER.error(
                        'Failed to load stylesheet at %s : %s', url, exc)

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'media':
            media = media_queries.parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning('Invalid media type "%s" '
                               'the whole @media rule was ignored at %s:%s.',
                               tinycss2.serialize(rule.prelude),
                               rule.source_line, rule.source_column)
                continue
            ignore_imports = True
            if not media_queries.evaluate_media_query(
                    media, device_media_type):
                continue
            # Recurse into the nested rules; imports are never accepted
            # inside @media.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(
                device_media_type, base_url, content_rules, url_fetcher,
                matcher, page_rules, fonts, font_config, ignore_imports=True)

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'page':
            data = parse_page_selectors(rule)

            if data is None:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue

            ignore_imports = True
            for page_type in data:
                # 'specificity' is removed from the dict so that the
                # remaining keys can be passed to PageType.
                specificity = page_type.pop('specificity')
                page_type = PageType(**page_type)
                content = tinycss2.parse_declaration_list(rule.content)
                declarations = list(preprocess_declarations(base_url, content))

                if declarations:
                    selector_list = [(specificity, None, page_type)]
                    page_rules.append((rule, selector_list, declarations))

                # Nested at-rules with content get their own entry, keyed
                # by '@' + their lower-cased keyword.
                for margin_rule in content:
                    if margin_rule.type != 'at-rule' or (
                            margin_rule.content is None):
                        continue
                    declarations = list(preprocess_declarations(
                        base_url,
                        tinycss2.parse_declaration_list(margin_rule.content)))
                    if declarations:
                        selector_list = [(
                            specificity, '@' + margin_rule.lower_at_keyword,
                            page_type)]
                        page_rules.append(
                            (margin_rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'font-face':
            ignore_imports = True
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            # Both descriptors are required; the for/else only registers
            # the font when neither is missing.
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'),
                        rule.source_line, rule.source_column)
                    break
            else:
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
def parse_rules(css: str):
	"""
	Parse a CSS stylesheet into a flat list of rules.

	Each qualified rule in the stylesheet is split on `,` into its individual
	selectors, and one CssRule is produced per selector. Selectors containing
	a pseudo element (`::`) are skipped. Every CssRule produced from the same
	qualified rule shares the same list of expanded declarations.

	The parsing is naive and should not be expected to be browser-grade.

	INPUTS
	css: The text of the stylesheet

	OUTPUTS
	A list of CssRule objects, in stylesheet order, each with `specificity`,
	`specificity_number` and `declarations` set.

	Raises se.InvalidCssException if the stylesheet or one of its declarations
	can't be parsed.
	"""

	rules = []

	# Parse the stylesheet to break it into rules and their associated properties
	for token in tinycss2.parse_stylesheet(css, skip_comments=True):
		if token.type == "error":
			raise se.InvalidCssException(token.message)

		# Only CSS rules are of interest
		if token.type != "qualified-rule":
			continue

		selectors = tinycss2.serialize(token.prelude).strip()

		# First, get a list of declarations within the { } block.
		# Parse each declaration and add it to the rule
		declarations = []
		for item in tinycss2.parse_declaration_list(token.content):
			if item.type == "error":
				# Report the message of the failing declaration itself; the enclosing
				# rule token carries no message.
				raise se.InvalidCssException(f"Couldn’t parse CSS. Exception: {item.message}")

			if item.type == "declaration":
				declaration = CssDeclaration(item.lower_name, item.value, item.important)
				declarations += declaration.expand()

		# We can have multiple selectors in a rule separated by `,`
		for selector in selectors.split(","):
			# Skip selectors containing pseudo elements
			if "::" in selector:
				continue

			selector = selector.strip()

			rule = CssRule(selector)

			# Calculate the specificity of the selector
			# See https://www.w3.org/TR/CSS2/cascade.html#specificity
			# a = 0 always (no style attributes apply here)

			# First remove strings, because they can contain `:`
			selector = regex.sub(r"\"[^\"]+?\"", "", selector)

			# b = number of ID attributes
			specificity_b = len(regex.findall(r"#", selector))

			# c = number of other attributes or pseudo classes
			specificity_c = len(regex.findall(r"[\.\[\:]", selector))

			# d = number of element names and pseudo elements (which will be 0 for us)
			specificity_d = len(regex.findall(r"(?:^[a-z]|\s[a-z])", selector))

			rule.specificity = (specificity_b, specificity_c, specificity_d)

			rule.specificity_number = specificity_b * 100 + specificity_c * 10 + specificity_d

			# Done with specificity, assign the declarations and save the rule
			rule.declarations = declarations

			rules.append(rule)

	return rules
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, matcher, page_rules, fonts,
                          font_config, counter_style, ignore_imports=False):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Walk ``stylesheet_rules`` and dispatch on the rule kind:

    - qualified rules: their selectors are added to ``matcher`` together
      with the validated declarations;
    - ``@import``: the referenced stylesheet is loaded through ``CSS``
      unless ``ignore_imports`` is already set;
    - ``@media``: the nested rules are processed recursively when the
      media query matches ``device_media_type``;
    - ``@page``: entries are appended to ``page_rules`` for the page
      itself and for each nested at-rule that has content;
    - ``@font-face``: the font is registered with ``font_config`` and the
      resulting file name appended to ``fonts``;
    - ``@counter-style``: the validated counter description is stored in
      ``counter_style`` under the parsed name.

    ``ignore_imports`` becomes true as soon as one of the other rule kinds
    has been accepted; later ``@import`` rules are then warned about and
    skipped.

    """
    for rule in stylesheet_rules:
        # Skip rules without a content block, except @import rules, which
        # are still processed below.
        if getattr(rule, 'content', None) is None and (
                rule.type != 'at-rule' or rule.lower_at_keyword != 'import'):
            continue

        if rule.type == 'qualified-rule':
            declarations = list(
                preprocess_declarations(
                    base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                # Selector problems are logged at WARNING, except prefixed
                # pseudo-elements, which are downgraded to DEBUG below.
                logger_level = WARNING
                try:
                    selectors = cssselect2.compile_selector_list(rule.prelude)
                    for selector in selectors:
                        # The selector is registered before its
                        # pseudo-element is checked, so selectors handled
                        # before the failing one stay in the matcher.
                        matcher.add_selector(selector, declarations)
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            if selector.pseudo_element.startswith('-'):
                                logger_level = DEBUG
                                raise cssselect2.SelectorError(
                                    'ignored prefixed pseudo-element: '
                                    f'{selector.pseudo_element}')
                            else:
                                raise cssselect2.SelectorError(
                                    'unknown pseudo-element: '
                                    f'{selector.pseudo_element}')
                    ignore_imports = True
                except cssselect2.SelectorError as exc:
                    LOGGER.log(logger_level,
                               "Invalid or unsupported selector '%s', %s",
                               tinycss2.serialize(rule.prelude), exc)
                    continue
            else:
                # A rule without any valid declaration still ends the
                # section where @import is accepted.
                ignore_imports = True

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'import':
            if ignore_imports:
                LOGGER.warning(
                    '@import rule %r not at the beginning of the '
                    'the whole rule was ignored at %d:%d.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue

            # The first token must be the URL; the rest is the media query.
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = media_queries.parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning(
                    'Invalid media type %r '
                    'the whole @import rule was ignored at %d:%d.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            if not media_queries.evaluate_media_query(media,
                                                      device_media_type):
                continue
            url = url_join(base_url,
                           url,
                           allow_relative=False,
                           context='@import at %d:%d',
                           context_args=(rule.source_line,
                                         rule.source_column))
            if url is not None:
                try:
                    # The object is created for its side effects only; it
                    # receives the same matcher, page_rules and
                    # counter_style as this call (presumably filling
                    # them -- confirm in CSS).
                    CSS(url=url,
                        url_fetcher=url_fetcher,
                        media_type=device_media_type,
                        font_config=font_config,
                        counter_style=counter_style,
                        matcher=matcher,
                        page_rules=page_rules)
                except URLFetchingError as exc:
                    LOGGER.error('Failed to load stylesheet at %s : %s', url,
                                 exc)

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'media':
            media = media_queries.parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type %r '
                    'the whole @media rule was ignored at %d:%d.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            ignore_imports = True
            if not media_queries.evaluate_media_query(media,
                                                      device_media_type):
                continue
            # Recurse into the nested rules; imports are never accepted
            # inside @media.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(device_media_type,
                                  base_url,
                                  content_rules,
                                  url_fetcher,
                                  matcher,
                                  page_rules,
                                  fonts,
                                  font_config,
                                  counter_style,
                                  ignore_imports=True)

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'page':
            data = parse_page_selectors(rule)

            if data is None:
                LOGGER.warning(
                    'Unsupported @page selector %r, '
                    'the whole @page rule was ignored at %d:%d.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue

            ignore_imports = True
            for page_type in data:
                # 'specificity' is removed from the dict so that the
                # remaining keys can be passed to PageType.
                specificity = page_type.pop('specificity')
                page_type = PageType(**page_type)
                content = tinycss2.parse_declaration_list(rule.content)
                declarations = list(preprocess_declarations(base_url, content))

                if declarations:
                    selector_list = [(specificity, None, page_type)]
                    page_rules.append((rule, selector_list, declarations))

                # Nested at-rules with content get their own entry, keyed
                # by '@' + their lower-cased keyword.
                for margin_rule in content:
                    if margin_rule.type != 'at-rule' or (margin_rule.content
                                                         is None):
                        continue
                    declarations = list(
                        preprocess_declarations(
                            base_url,
                            tinycss2.parse_declaration_list(
                                margin_rule.content)))
                    if declarations:
                        selector_list = [
                            (specificity, '@' + margin_rule.lower_at_keyword,
                             page_type)
                        ]
                        page_rules.append(
                            (margin_rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.lower_at_keyword == 'font-face':
            ignore_imports = True
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(
                preprocess_descriptors('font-face', base_url, content))
            # Both descriptors are required; the for/else only registers
            # the font when neither is missing.
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %d:%d",
                        key.replace('_', '-'), rule.source_line,
                        rule.source_column)
                    break
            else:
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)

        elif (rule.type == 'at-rule'
              and rule.lower_at_keyword == 'counter-style'):
            name = counters.parse_counter_style_name(rule.prelude,
                                                     counter_style)
            if name is None:
                LOGGER.warning(
                    'Invalid counter style name %r, the whole '
                    '@counter-style rule was ignored at %d:%d.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue

            ignore_imports = True
            content = tinycss2.parse_declaration_list(rule.content)
            # Every known descriptor starts as None and is overwritten by
            # the values found in the rule.
            counter = {
                'system': None,
                'negative': None,
                'prefix': None,
                'suffix': None,
                'range': None,
                'pad': None,
                'fallback': None,
                'symbols': None,
                'additive_symbols': None,
            }
            rule_descriptors = dict(
                preprocess_descriptors('counter-style', base_url, content))

            for descriptor_name, descriptor_value in rule_descriptors.items():
                counter[descriptor_name] = descriptor_value

            # A missing 'system' descriptor is treated as 'symbolic'.
            if counter['system'] is None:
                system = (None, 'symbolic', None)
            else:
                system = counter['system']

            # The symbol-count checks only run when the first element of
            # the system tuple is None; each failure skips the whole rule.
            if system[0] is None:
                if system[1] in ('cyclic', 'fixed', 'symbolic'):
                    if len(counter['symbols'] or []) < 1:
                        LOGGER.warning(
                            'In counter style %r at %d:%d, '
                            'counter style %r needs at least one symbol',
                            name, rule.source_line, rule.source_column,
                            system[1])
                        continue
                elif system[1] in ('alphabetic', 'numeric'):
                    if len(counter['symbols'] or []) < 2:
                        LOGGER.warning(
                            'In counter style %r at %d:%d, '
                            'counter style %r needs at least two symbols',
                            name, rule.source_line, rule.source_column,
                            system[1])
                        continue
                elif system[1] == 'additive':
                    if len(counter['additive_symbols'] or []) < 2:
                        LOGGER.warning(
                            'In counter style %r at %d:%d, '
                            'counter style "additive" '
                            'needs at least two additive symbols', name,
                            rule.source_line, rule.source_column)
                        continue

            counter_style[name] = counter
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, matcher, page_rules, fonts,
                          font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Walk ``stylesheet_rules`` and dispatch on the kind of each rule:

    - qualified rules: their selectors are compiled and added to
      ``matcher`` together with the preprocessed declarations;
    - ``@import``: the referenced stylesheet is loaded through ``CSS``,
      sharing ``matcher`` and ``page_rules``;
    - ``@media``: the nested rules are processed recursively when the
      media query matches ``device_media_type``;
    - ``@page``: ``(rule, selector_list, declarations)`` tuples are
      appended to ``page_rules`` for the page itself and for each of its
      nested margin at-rules;
    - ``@font-face``: the font is registered with ``font_config`` and the
      returned filename, if any, is appended to ``fonts``.

    Invalid or unsupported rules are logged and skipped. Nothing is
    returned: the results are accumulated in ``matcher``, ``page_rules``
    and ``fonts``.

    :param device_media_type: media type that media queries are tested
        against.
    :param base_url: base URL used for declarations, descriptors and
        relative ``@import`` URLs.
    :param stylesheet_rules: iterable of parsed tinycss2 rules.
    :param url_fetcher: passed on to ``CSS`` and
        ``font_config.add_font_face``.
    :param matcher: object whose ``add_selector(selector, declarations)``
        is called for every compiled selector.
    :param page_rules: list extended in place with ``@page`` rules.
    :param fonts: list extended in place with font filenames.
    :param font_config: font configuration, or ``None`` to validate
        ``@font-face`` rules without loading them.

    """
    for rule in stylesheet_rules:
        # At-keywords are ASCII case-insensitive in CSS, so compare the
        # lower-cased form: ``@MEDIA`` must behave like ``@media``.
        at_keyword = (
            rule.lower_at_keyword if rule.type == 'at-rule' else None)

        if rule.type == 'qualified-rule':
            declarations = list(preprocess_declarations(
                base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                try:
                    selectors = cssselect2.compile_selector_list(rule.prelude)
                    for selector in selectors:
                        # NOTE(review): the selector is added to the matcher
                        # before its pseudo-element is checked, so selectors
                        # handled before the error stay registered.
                        matcher.add_selector(selector, declarations)
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect2.SelectorError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect2.SelectorError as exc:
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   tinycss2.serialize(rule.prelude), exc)
                    continue

        elif at_keyword == 'import':
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                # No usable URL: silently ignore the rule.
                continue
            media = parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @import rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                # Really ignore the rule, as the message says and as the
                # @media branch does, instead of evaluating a ``None``
                # media query below.
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            url = url_join(
                base_url, url, allow_relative=False,
                context='@import at %s:%s',
                context_args=(rule.source_line, rule.source_column))
            if url is not None:
                try:
                    # Instantiated for its side effects only: the imported
                    # stylesheet receives the same matcher and page_rules.
                    CSS(
                        url=url, url_fetcher=url_fetcher,
                        media_type=device_media_type, font_config=font_config,
                        matcher=matcher, page_rules=page_rules)
                except URLFetchingError as exc:
                    LOGGER.error(
                        'Failed to load stylesheet at %s : %s', url, exc)

        elif at_keyword == 'media':
            media = parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @media rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            # The content of a matching @media rule is a rule list of its
            # own: process it with the same accumulators.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(
                device_media_type, base_url, content_rules, url_fetcher,
                matcher, page_rules, fonts, font_config)

        elif at_keyword == 'page':
            tokens = remove_whitespace(rule.prelude)
            types = {
                'side': None, 'blank': False, 'first': False, 'name': None}
            # TODO: Specificity is probably wrong, should clean and test that.
            if not tokens:
                # Bare ``@page``.
                specificity = (0, 0, 0)
            elif (len(tokens) == 2 and
                    tokens[0].type == 'literal' and
                    tokens[0].value == ':' and
                    tokens[1].type == 'ident'):
                # ``@page :pseudo-class``.
                pseudo_class = tokens[1].lower_value
                if pseudo_class in ('left', 'right'):
                    types['side'] = pseudo_class
                    specificity = (0, 0, 1)
                elif pseudo_class in ('blank', 'first'):
                    types[pseudo_class] = True
                    specificity = (0, 1, 0)
                else:
                    LOGGER.warning(
                        'Unknown @page pseudo-class "%s", '
                        'the whole @page rule was ignored '
                        'at %s:%s.',
                        pseudo_class,
                        rule.source_line, rule.source_column)
                    continue
            elif len(tokens) == 1 and tokens[0].type == 'ident':
                # ``@page name``: page names keep their original case.
                types['name'] = tokens[0].value
                specificity = (1, 0, 0)
            else:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            page_type = PageType(**types)
            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_type: lambda page_names: list(
                matching_page_types(page_type, names=page_names)))(page_type)
            content = tinycss2.parse_declaration_list(rule.content)
            declarations = list(preprocess_declarations(base_url, content))

            if declarations:
                selector_list = [(specificity, None, match)]
                page_rules.append((rule, selector_list, declarations))

            # Nested at-rules of an @page rule are its margin boxes.
            for margin_rule in content:
                if margin_rule.type != 'at-rule':
                    continue
                declarations = list(preprocess_declarations(
                    base_url,
                    tinycss2.parse_declaration_list(margin_rule.content)))
                if declarations:
                    selector_list = [(
                        specificity, '@' + margin_rule.lower_at_keyword,
                        match)]
                    page_rules.append(
                        (margin_rule, selector_list, declarations))

        elif at_keyword == 'font-face':
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'),
                        rule.source_line, rule.source_column)
                    break
            else:
                # Both mandatory descriptors are present.
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)