def transform(self, html=None, pretty_print=True, **kwargs):
    """Inline the document's CSS into ``style`` attributes and return it.

    The markup is taken either from the ``html`` argument or from
    ``self.html``; exactly one of the two must be set.

    Args:
        html: Markup to transform, or an object exposing ``getroottree()``
            (presumably an lxml element/tree -- confirm against callers).
            ``None`` means "use ``self.html``".
        pretty_print: Default for the ``pretty_print`` keyword passed to
            ``etree.tostring``.
        **kwargs: Extra keyword arguments forwarded to ``etree.tostring``.
            ``method``, ``pretty_print`` and ``encoding`` are only filled
            in when the caller did not supply them.

    Returns:
        The result of ``html.getroottree()`` when the input exposed that
        method; otherwise the serialised document as a string.

    Raises:
        TypeError: If ``html`` is given both here and on the instance, or
            in neither place.
        ValueError: If ``self.base_url`` is set without a URL scheme while
            link rewriting is enabled.
    """
    if html is not None and self.html is not None:
        raise TypeError("Can't pass html argument twice")
    elif html is None and self.html is None:
        raise TypeError("must pass html as first argument")
    elif html is None:
        html = self.html

    if hasattr(html, "getroottree"):
        # Already parsed: work on the caller's tree directly.
        root = html.getroottree()
        page = root
        tree = root
    else:
        if self.method == "xml":
            parser = etree.XMLParser(ns_clean=False, resolve_entities=False)
        else:
            parser = etree.HTMLParser()
        stripped = html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml inserts a doctype if none exists, so only include it in
        # the root if it was in the original html.
        root = tree if stripped.startswith(tree.docinfo.doctype) else page

    assert page is not None

    if self.disable_leftover_css:
        head = None
    else:
        head = get_or_create_head(tree)

    #
    # style selectors
    #
    rules = []
    index = 0

    # <link> stylesheets are only considered when network access is allowed.
    cssselector = ["style"]
    if self.allow_network:
        cssselector.append("link[rel~=stylesheet]")
    for element in _create_cssselector(",".join(cssselector))(page):
        # If we have a media attribute whose value is anything other than
        # 'all' or 'screen', ignore the ruleset.
        media = element.attrib.get("media")
        if media and media not in ("all", "screen"):
            continue

        data_attribute = element.attrib.get(self.attribute_name)
        if data_attribute:
            if data_attribute == "ignore":
                # Leave this stylesheet alone, but drop the marker itself.
                del element.attrib[self.attribute_name]
                continue
            else:
                warnings.warn(
                    "Unrecognized %s attribute (%r)"
                    % (self.attribute_name, data_attribute)
                )

        is_style = element.tag == "style"
        if is_style:
            css_body = element.text
        else:
            href = element.attrib.get("href")
            css_body = self._load_external(href)

        these_rules, these_leftover = self._parse_style_rules(css_body, index)
        index += 1
        rules.extend(these_rules)

        parent_of_element = element.getparent()
        if these_leftover or self.keep_style_tags:
            if is_style:
                style = element
            else:
                style = etree.Element("style")
                style.attrib["type"] = "text/css"
            if self.keep_style_tags:
                style.text = css_body
            else:
                style.text = self._css_rules_to_string(these_leftover)
            if self.method == "xml":
                style.text = etree.CDATA(style.text)

            if not is_style:
                # Swap the <link> for the freshly built <style> element.
                element.addprevious(style)
                parent_of_element.remove(element)
        elif not self.keep_style_tags or not is_style:
            parent_of_element.remove(element)

    # external style files
    if self.external_styles and self.allow_network:
        for stylefile in self.external_styles:
            css_body = self._load_external(stylefile)
            self._process_css_text(css_body, index, rules, head)
            index += 1

    # css text
    if self.css_text:
        for css_body in self.css_text:
            self._process_css_text(css_body, index, rules, head)
            index += 1

    # rules is a tuple of (specificity, selector, styles), where
    # specificity is a tuple ordered such that more specific
    # rules sort larger.
    rules.sort(key=operator.itemgetter(0))

    # collecting all elements that we need to apply rules on
    # id is unique for the lifetime of the object
    # and lxml should give us the same everytime during this run
    # item id -> {item: item, classes: [], style: []}
    elements = {}
    for _, selector, style in rules:
        new_selector = selector
        class_ = ""
        if ":" in selector:
            # Plain string split: no regex needed for a literal ':' and it
            # avoids re.split's deprecated positional ``maxsplit``.
            new_selector, class_ = selector.split(":", 1)
            class_ = ":%s" % class_
        # Keep filter-type selectors untouched.
        if class_ in FILTER_PSEUDOSELECTORS or class_.startswith(":nth-child"):
            class_ = ""
        else:
            selector = new_selector

        assert selector
        try:
            sel = _create_cssselector(selector)
        except SelectorSyntaxError:
            # TODO: this should be optional
            # Skip selectors that cannot be compiled. Falling through would
            # use an unbound or stale ``sel`` from a previous rule.
            continue
        items = sel(page)
        if len(items):
            # same so process it first
            processed_style = csstext_to_pairs(
                style, validate=not self.disable_validation
            )

            for item in items:
                item_id = id(item)
                if item_id not in elements:
                    elements[item_id] = {
                        "item": item,
                        "classes": [],
                        "style": [],
                    }

                elements[item_id]["style"].append(processed_style)
                elements[item_id]["classes"].append(class_)

    # Now apply inline style
    # merge style only once for each element
    # crucial when you have a lot of pseudo/classes
    # and a long list of elements
    for element in elements.values():
        final_style = merge_styles(
            element["item"].attrib.get("style", ""),
            element["style"],
            element["classes"],
            remove_unset_properties=self.remove_unset_properties,
        )
        if final_style:
            # final style could be empty string because of
            # remove_unset_properties
            element["item"].attrib["style"] = final_style
        self._style_to_basic_html_attributes(
            element["item"], final_style, force=True
        )

    if self.remove_classes:
        # now we can delete all 'class' attributes
        for item in page.xpath("//@class"):
            parent = item.getparent()
            del parent.attrib["class"]

    # Capitalize Margin properties
    # To fix weird outlook bug
    # https://www.emailonacid.com/blog/article/email-development/outlook.com-does-support-margins
    if self.capitalize_float_margin:
        for item in page.xpath("//@style"):
            mangled = capitalize_float_margin(item)
            item.getparent().attrib["style"] = mangled

    # Add align attributes to images if they have a CSS float value of
    # right or left. Outlook (both on desktop and on the web) are bad at
    # understanding floats, but they do understand the HTML align attrib.
    if self.align_floating_images:
        for item in page.xpath("//img[@style]"):
            image_css = cssutils.parseStyle(item.attrib["style"])
            if image_css.float == "right":
                item.attrib["align"] = "right"
            elif image_css.float == "left":
                item.attrib["align"] = "left"

    #
    # URLs
    #
    if self.base_url and not self.disable_link_rewrites:
        if not urlparse(self.base_url).scheme:
            raise ValueError("Base URL must have a scheme")
        for attr in ("href", "src"):
            for item in page.xpath("//@%s" % attr):
                parent = item.getparent()
                url = parent.attrib[attr]
                if (
                    attr == "href"
                    and self.preserve_internal_links
                    and url.startswith("#")
                ):
                    continue
                if (
                    attr == "src"
                    and self.preserve_inline_attachments
                    and url.startswith("cid:")
                ):
                    continue
                if attr == "href" and url.startswith("tel:"):
                    continue
                parent.attrib[attr] = urljoin(self.base_url, url)

    if hasattr(html, "getroottree"):
        return root
    else:
        kwargs.setdefault("method", self.method)
        kwargs.setdefault("pretty_print", pretty_print)
        kwargs.setdefault("encoding", "utf-8")  # As Ken Thompson intended
        out = etree.tostring(root, **kwargs).decode(kwargs["encoding"])
        if self.method == "xml":
            out = _cdata_regex.sub(
                lambda m: "/*<![CDATA[*/%s/*]]>*/" % m.group(1), out
            )
        if self.strip_important:
            out = _importants.sub("", out)
        return out
def transform(self, pretty_print=True, **kwargs):
    """Return ``self.html`` with its stylesheet rules inlined as ``style``
    attributes.

    When ``self.html`` exposes ``getroottree()`` the tree it returns is
    modified and handed back; otherwise ``self.html`` is parsed, processed
    and serialised with ``etree.tostring``. ``pretty_print`` and ``kwargs``
    are forwarded to that call, with ``method``, ``pretty_print`` and
    ``encoding`` filled in only when the caller did not supply them.

    Raises ``ValueError`` if ``self.base_url`` is set but has no scheme.
    """
    if not hasattr(self.html, "getroottree"):
        parser = (
            etree.XMLParser(ns_clean=False, resolve_entities=False)
            if self.method == 'xml'
            else etree.HTMLParser()
        )
        stripped = self.html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml adds a doctype of its own when the input had none, so the
        # full tree is only used as root if the input began with one.
        root = tree if stripped.startswith(tree.docinfo.doctype) else page
    else:
        # Input is already a parsed document; operate on it directly.
        root = page = tree = self.html.getroottree()

    assert page is not None

    head = None if self.disable_leftover_css else get_or_create_head(tree)

    # ---- gather rules from <style> and <link rel=stylesheet> ----
    rules = []
    index = 0
    for node in CSSSelector('style,link[rel~=stylesheet]')(page):
        # Only media-less, 'all' or 'screen' stylesheets are inlined.
        media = node.attrib.get('media')
        if media and media not in ('all', 'screen'):
            continue

        marker = node.attrib.get(self.attribute_name)
        if marker == 'ignore':
            del node.attrib[self.attribute_name]
            continue
        if marker:
            warnings.warn(
                'Unrecognized %s attribute (%r)'
                % (self.attribute_name, marker)
            )

        is_style = node.tag == 'style'
        if is_style:
            css_body = node.text
        else:
            css_body = self._load_external(node.attrib.get('href'))

        found_rules, leftover = self._parse_style_rules(css_body, index)
        index += 1
        rules.extend(found_rules)

        container = node.getparent()
        if leftover or self.keep_style_tags:
            if is_style:
                style_node = node
            else:
                style_node = etree.Element('style')
                style_node.attrib['type'] = 'text/css'
            style_node.text = (
                css_body
                if self.keep_style_tags
                else self._css_rules_to_string(leftover)
            )
            if self.method == 'xml':
                style_node.text = etree.CDATA(style_node.text)
            if not is_style:
                # The <link> is replaced by the new <style> element.
                node.addprevious(style_node)
                container.remove(node)
        elif not self.keep_style_tags or not is_style:
            container.remove(node)

    # ---- stylesheets configured on the instance ----
    for stylefile in self.external_styles or ():
        self._process_css_text(
            self._load_external(stylefile), index, rules, head
        )
        index += 1

    for css_text in self.css_text or ():
        self._process_css_text(css_text, index, rules, head)
        index += 1

    # Each rule is (specificity, selector, styles); sorting on the first
    # field puts more specific rules later.
    rules.sort(key=operator.itemgetter(0))

    # id(item) -> {'item': ..., 'classes': [...], 'style': [...]}
    # The id is stable for as long as the object lives, which covers
    # this run.
    matched = {}
    for _, selector, style in rules:
        base, pseudo = selector, ''
        if ':' in selector:
            base, pseudo = re.split(':', selector, maxsplit=1)
            pseudo = ':%s' % pseudo
        # Filter-type pseudo selectors stay part of the selector itself.
        if pseudo in FILTER_PSEUDOSELECTORS:
            pseudo = ''
        else:
            selector = base

        items = CSSSelector(selector)(page)
        if not len(items):
            continue

        # Parsed once per rule and shared by every matching element.
        pairs = csstext_to_pairs(style)
        for item in items:
            entry = matched.setdefault(
                id(item), {'item': item, 'classes': [], 'style': []}
            )
            entry['style'].append(pairs)
            entry['classes'].append(pseudo)

    # Merge once per element; this matters with many pseudo classes and
    # a long list of elements.
    for entry in matched.values():
        target = entry['item']
        final_style = merge_styles(
            target.attrib.get('style', ''),
            entry['style'],
            entry['classes'],
            remove_unset_properties=self.remove_unset_properties,
        )
        if final_style:
            # May be an empty string because of remove_unset_properties.
            target.attrib['style'] = final_style
        self._style_to_basic_html_attributes(target, final_style, force=True)

    if self.remove_classes:
        # All rules are applied, so 'class' attributes can go.
        for class_attr in page.xpath('//@class'):
            del class_attr.getparent().attrib['class']

    # Mirror a CSS float of right/left into the HTML align attribute on
    # images; Outlook (desktop and web) handles floats badly but does
    # understand align.
    if self.align_floating_images:
        for img in page.xpath('//img[@style]'):
            img_css = cssutils.parseStyle(img.attrib['style'])
            if img_css.float == 'right':
                img.attrib['align'] = 'right'
            elif img_css.float == 'left':
                img.attrib['align'] = 'left'

    # ---- resolve href/src against the base URL ----
    if self.base_url:
        if not urlparse(self.base_url).scheme:
            raise ValueError('Base URL must have a scheme')
        for attr in ('href', 'src'):
            for attr_node in page.xpath("//@%s" % attr):
                owner = attr_node.getparent()
                url = owner.attrib[attr]
                if (
                    attr == 'href'
                    and self.preserve_internal_links
                    and url.startswith('#')
                ) or (
                    attr == 'src'
                    and self.preserve_inline_attachments
                    and url.startswith('cid:')
                ):
                    continue
                owner.attrib[attr] = urljoin(self.base_url, url)

    if hasattr(self.html, "getroottree"):
        return root

    kwargs.setdefault('method', self.method)
    kwargs.setdefault('pretty_print', pretty_print)
    kwargs.setdefault('encoding', 'utf-8')  # As Ken Thompson intended
    out = etree.tostring(root, **kwargs).decode(kwargs['encoding'])
    if self.method == 'xml':
        out = _cdata_regex.sub(
            lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out
        )
    if self.strip_important:
        out = _importants.sub('', out)
    return out
def test_csstext_to_pairs(self):
    # A lone declaration must come back as a (property, value) tuple in
    # first position.
    pairs = csstext_to_pairs('font-size:1px')
    self.assertEqual(('font-size', '1px'), pairs[0])
def test_important_rule(self):
    # Since #133, parsing a declaration carrying "!important" no longer
    # raises; the first pair is checked against the bare property/value.
    pairs = csstext_to_pairs('font-size:1px !important')
    first_pair = pairs[0]
    self.assertEqual(('font-size', '1px'), first_pair)
def transform(self, pretty_print=True, **kwargs):
    """Inline the CSS of ``self.html`` into ``style`` attributes.

    A ``self.html`` that exposes ``getroottree()`` is processed in place
    and the tree it returned is the result. Anything else is parsed (XML
    or HTML parser depending on ``self.method``), processed and
    serialised via ``etree.tostring``. ``pretty_print`` and ``kwargs``
    feed that call; ``method``, ``pretty_print`` and ``encoding`` are
    defaulted only when absent from ``kwargs``.

    Raises ``ValueError`` if ``self.base_url`` is set but has no scheme.
    """
    if not hasattr(self.html, "getroottree"):
        if self.method == 'xml':
            parser = etree.XMLParser(ns_clean=False, resolve_entities=False)
        else:
            parser = etree.HTMLParser()
        stripped = self.html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml adds its own doctype when none was supplied; the whole
        # tree is only kept as root if the input started with a doctype.
        if stripped.startswith(tree.docinfo.doctype):
            root = tree
        else:
            root = page
    else:
        # Nothing to parse: the caller handed over a document already.
        root = self.html.getroottree()
        page = tree = root

    assert page is not None

    head = None if self.disable_leftover_css else get_or_create_head(tree)

    # ------------------------------------------------------------------
    # Stylesheets embedded in, or linked from, the document
    # ------------------------------------------------------------------
    rules = []
    index = 0
    for sheet in CSSSelector('style,link[rel~=stylesheet]')(page):
        # A media attribute other than 'all' or 'screen' excludes the
        # stylesheet from inlining.
        media = sheet.attrib.get('media')
        if media and media not in ('all', 'screen'):
            continue

        flag = sheet.attrib.get(self.attribute_name)
        if flag == 'ignore':
            del sheet.attrib[self.attribute_name]
            continue
        if flag:
            warnings.warn('Unrecognized %s attribute (%r)' % (
                self.attribute_name,
                flag,
            ))

        is_style = sheet.tag == 'style'
        css_body = (
            sheet.text
            if is_style
            else self._load_external(sheet.attrib.get('href'))
        )

        parsed_rules, unused_rules = self._parse_style_rules(css_body, index)
        index += 1
        rules.extend(parsed_rules)

        sheet_parent = sheet.getparent()
        if unused_rules or self.keep_style_tags:
            if is_style:
                style_el = sheet
            else:
                style_el = etree.Element('style')
                style_el.attrib['type'] = 'text/css'
            if self.keep_style_tags:
                style_el.text = css_body
            else:
                style_el.text = self._css_rules_to_string(unused_rules)
            if self.method == 'xml':
                style_el.text = etree.CDATA(style_el.text)
            if not is_style:
                # Put the generated <style> where the <link> used to be.
                sheet.addprevious(style_el)
                sheet_parent.remove(sheet)
        elif not self.keep_style_tags or not is_style:
            sheet_parent.remove(sheet)

    # ------------------------------------------------------------------
    # Stylesheets supplied through the instance configuration
    # ------------------------------------------------------------------
    if self.external_styles:
        for stylefile in self.external_styles:
            loaded = self._load_external(stylefile)
            self._process_css_text(loaded, index, rules, head)
            index += 1

    if self.css_text:
        for raw_css in self.css_text:
            self._process_css_text(raw_css, index, rules, head)
            index += 1

    # Rules are (specificity, selector, styles) tuples; ordering by
    # specificity makes the more specific ones come last.
    rules.sort(key=operator.itemgetter(0))

    # Bucket the matching rules per element, keyed on id(item), which
    # stays unique while the object is alive:
    # id -> {'item': item, 'classes': [...], 'style': [...]}
    per_element = {}
    for _, selector, style in rules:
        stripped_selector = selector
        pseudo = ''
        if ':' in selector:
            stripped_selector, pseudo = re.split(':', selector, maxsplit=1)
            pseudo = ':%s' % pseudo
        # Filter-type pseudo selectors are left on the selector.
        if pseudo in FILTER_PSEUDOSELECTORS:
            pseudo = ''
        else:
            selector = stripped_selector

        matches = CSSSelector(selector)(page)
        if len(matches):
            # Identical for every match, so parse it a single time.
            style_pairs = csstext_to_pairs(style)
            for item in matches:
                key = id(item)
                if key not in per_element:
                    per_element[key] = {
                        'item': item,
                        'classes': [],
                        'style': [],
                    }
                bucket = per_element[key]
                bucket['style'].append(style_pairs)
                bucket['classes'].append(pseudo)

    # One merge per element keeps this cheap even with many pseudo
    # classes and a long element list.
    for bucket in per_element.values():
        node = bucket['item']
        final_style = merge_styles(
            node.attrib.get('style', ''),
            bucket['style'],
            bucket['classes'],
            remove_unset_properties=self.remove_unset_properties,
        )
        if final_style:
            # Empty string is possible because of remove_unset_properties.
            node.attrib['style'] = final_style
        self._style_to_basic_html_attributes(node, final_style, force=True)

    if self.remove_classes:
        # 'class' attributes have served their purpose by now.
        for class_attr in page.xpath('//@class'):
            holder = class_attr.getparent()
            del holder.attrib['class']

    # Capitalize Margin properties to work around an Outlook bug, see
    # https://www.emailonacid.com/blog/article/email-development/outlook.com-does-support-margins
    if self.capitalize_float_margin:
        for style_attr in page.xpath('//@style'):
            style_attr.getparent().attrib['style'] = capitalize_float_margin(
                style_attr
            )

    # Images floated right/left also get the matching HTML align
    # attribute: Outlook (desktop and web) copes badly with floats but
    # understands align.
    if self.align_floating_images:
        for img in page.xpath('//img[@style]'):
            declared = cssutils.parseStyle(img.attrib['style'])
            if declared.float == 'right':
                img.attrib['align'] = 'right'
            elif declared.float == 'left':
                img.attrib['align'] = 'left'

    # ------------------------------------------------------------------
    # URLs
    # ------------------------------------------------------------------
    if self.base_url:
        if not urlparse(self.base_url).scheme:
            raise ValueError('Base URL must have a scheme')
        for attr in ('href', 'src'):
            for attr_node in page.xpath("//@%s" % attr):
                owner = attr_node.getparent()
                url = owner.attrib[attr]
                if attr == 'href':
                    if self.preserve_internal_links and url.startswith('#'):
                        continue
                elif (self.preserve_inline_attachments and
                        url.startswith('cid:')):
                    continue
                if attr == 'href' and url.startswith('tel:'):
                    continue
                owner.attrib[attr] = urljoin(self.base_url, url)

    if hasattr(self.html, "getroottree"):
        return root

    kwargs.setdefault('method', self.method)
    kwargs.setdefault('pretty_print', pretty_print)
    kwargs.setdefault('encoding', 'utf-8')  # As Ken Thompson intended
    serialised = etree.tostring(root, **kwargs).decode(kwargs['encoding'])
    if self.method == 'xml':
        serialised = _cdata_regex.sub(
            lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), serialised)
    if self.strip_important:
        serialised = _importants.sub('', serialised)
    return serialised
def test_csstext_to_pairs(self):
    # The first parsed pair of a one-declaration string must be the
    # (property, value) tuple.
    first_pair = csstext_to_pairs("font-size:1px")[0]
    self.assertEqual(("font-size", "1px"), first_pair)
def transform(self, html=None, pretty_print=True, **kwargs):
    """Inline the document's CSS into ``style`` attributes and return it.

    The markup comes from ``html`` or, when that is ``None``, from
    ``self.html``; supplying both or neither raises ``TypeError``.

    An input exposing ``getroottree()`` is processed in place and the
    tree it returned is the result. Any other input is parsed, processed
    and serialised with ``etree.tostring``; ``pretty_print`` and
    ``kwargs`` feed that call, with ``method``, ``pretty_print`` and
    ``encoding`` defaulted only when missing from ``kwargs``.

    Raises ``ValueError`` if ``self.base_url`` lacks a scheme while link
    rewriting is enabled.
    """
    if html is None:
        if self.html is None:
            raise TypeError("must pass html as first argument")
        html = self.html
    elif self.html is not None:
        raise TypeError("Can't pass html argument twice")

    if not hasattr(html, "getroottree"):
        parser = (
            etree.XMLParser(ns_clean=False, resolve_entities=False)
            if self.method == "xml"
            else etree.HTMLParser()
        )
        stripped = html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml supplies a doctype when the input had none, so the tree is
        # only used as root if the input itself started with the doctype.
        root = tree if stripped.startswith(tree.docinfo.doctype) else page
    else:
        # Pre-parsed input: use its tree for everything.
        root = page = tree = html.getroottree()

    assert page is not None

    head = None if self.disable_leftover_css else get_or_create_head(tree)

    #
    # style selectors
    #
    rules = []
    index = 0

    # <link> stylesheets are only looked at when network use is allowed.
    sheet_selector = (
        "style,link[rel~=stylesheet]" if self.allow_network else "style"
    )
    for sheet in _create_cssselector(sheet_selector)(page):
        # Skip rulesets whose media attribute is set to something other
        # than 'all' or 'screen'.
        media = sheet.attrib.get("media")
        if media and media not in ("all", "screen"):
            continue

        marker = sheet.attrib.get(self.attribute_name)
        if marker == "ignore":
            del sheet.attrib[self.attribute_name]
            continue
        if marker:
            warnings.warn(
                "Unrecognized %s attribute (%r)" % (self.attribute_name, marker)
            )

        is_style = sheet.tag == "style"
        if is_style:
            css_body = sheet.text
        else:
            css_body = self._load_external(sheet.attrib.get("href"))

        inlinable, leftover = self._parse_style_rules(css_body, index)
        index += 1
        rules.extend(inlinable)

        sheet_parent = sheet.getparent()
        if leftover or self.keep_style_tags:
            if is_style:
                style_el = sheet
            else:
                style_el = etree.Element("style")
                style_el.attrib["type"] = "text/css"
            style_el.text = (
                css_body
                if self.keep_style_tags
                else self._css_rules_to_string(leftover)
            )
            if self.method == "xml":
                style_el.text = etree.CDATA(style_el.text)
            if not is_style:
                # Replace the <link> with the generated <style> element.
                sheet.addprevious(style_el)
                sheet_parent.remove(sheet)
        elif not self.keep_style_tags or not is_style:
            sheet_parent.remove(sheet)

    # external style files
    if self.external_styles and self.allow_network:
        for stylefile in self.external_styles:
            fetched = self._load_external(stylefile)
            self._process_css_text(fetched, index, rules, head)
            index += 1

    # css text
    for raw_css in self.css_text or ():
        self._process_css_text(raw_css, index, rules, head)
        index += 1

    # Every rule is (specificity, selector, styles); sorting by the
    # specificity tuple places more specific rules later.
    rules.sort(key=operator.itemgetter(0))

    # Group what has to be applied per element, keyed on id(item) (unique
    # while the object lives, and stable for this run):
    # id -> {"item": item, "classes": [...], "style": [...]}
    targets = {}
    for _, selector, style in rules:
        plain, pseudo = selector, ""
        if ":" in selector:
            plain, pseudo = re.split(":", selector, maxsplit=1)
            pseudo = ":%s" % pseudo
        # Filter-type selectors (and :nth-child) stay on the selector.
        if pseudo in FILTER_PSEUDOSELECTORS or pseudo.startswith(":nth-child"):
            pseudo = ""
        else:
            selector = plain

        assert selector
        matches = _create_cssselector(selector)(page)
        if not len(matches):
            continue

        # Shared by all matches of this rule, so parse it only once.
        pairs = csstext_to_pairs(style)
        for item in matches:
            slot = targets.setdefault(
                id(item), {"item": item, "classes": [], "style": []}
            )
            slot["style"].append(pairs)
            slot["classes"].append(pseudo)

    # Merge a single time per element; important with many pseudo
    # classes and long element lists.
    for slot in targets.values():
        node = slot["item"]
        final_style = merge_styles(
            node.attrib.get("style", ""),
            slot["style"],
            slot["classes"],
            remove_unset_properties=self.remove_unset_properties,
        )
        if final_style:
            # Can be an empty string because of remove_unset_properties.
            node.attrib["style"] = final_style
        self._style_to_basic_html_attributes(node, final_style, force=True)

    if self.remove_classes:
        # Rules are applied; every 'class' attribute can be deleted.
        for class_attr in page.xpath("//@class"):
            del class_attr.getparent().attrib["class"]

    # Capitalize Margin properties to work around an Outlook bug, see
    # https://www.emailonacid.com/blog/article/email-development/outlook.com-does-support-margins
    if self.capitalize_float_margin:
        for style_attr in page.xpath("//@style"):
            style_attr.getparent().attrib["style"] = capitalize_float_margin(
                style_attr
            )

    # Give images with a CSS float of right/left the matching HTML align
    # attribute: Outlook (desktop and web) is bad at floats but does
    # understand align.
    if self.align_floating_images:
        for img in page.xpath("//img[@style]"):
            img_css = cssutils.parseStyle(img.attrib["style"])
            if img_css.float == "right":
                img.attrib["align"] = "right"
            elif img_css.float == "left":
                img.attrib["align"] = "left"

    #
    # URLs
    #
    if self.base_url and not self.disable_link_rewrites:
        if not urlparse(self.base_url).scheme:
            raise ValueError("Base URL must have a scheme")
        for attr in ("href", "src"):
            for attr_node in page.xpath("//@%s" % attr):
                owner = attr_node.getparent()
                url = owner.attrib[attr]
                if (
                    attr == "href"
                    and self.preserve_internal_links
                    and url.startswith("#")
                ) or (
                    attr == "src"
                    and self.preserve_inline_attachments
                    and url.startswith("cid:")
                ):
                    continue
                if attr == "href" and url.startswith("tel:"):
                    continue
                owner.attrib[attr] = urljoin(self.base_url, url)

    if hasattr(html, "getroottree"):
        return root

    kwargs.setdefault("method", self.method)
    kwargs.setdefault("pretty_print", pretty_print)
    kwargs.setdefault("encoding", "utf-8")  # As Ken Thompson intended
    out = etree.tostring(root, **kwargs).decode(kwargs["encoding"])
    if self.method == "xml":
        out = _cdata_regex.sub(
            lambda m: "/*<![CDATA[*/%s/*]]>*/" % m.group(1), out
        )
    if self.strip_important:
        out = _importants.sub("", out)
    return out