def test_dont_transform_div_with_div():
    """Check that a <div> containing another <div> is left alone.

    Only the inner (leaf) <div> is expected to come back as a <p>.
    """
    source_html = (
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )
    expected_html = (
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )

    transformed = leaf_div_elements_into_paragraphs(
        document_fromstring(source_html)
    )

    assert tounicode(transformed) == to_unicode(expected_html)
def test_misused_divs_transform():
    """Check that leaf <div> nodes are replaced with <p> nodes.

    The content must be unchanged: only the tag goes from div to p,
    both for a text-only div and for a div that also holds a link.
    """
    # (input markup, expected markup) pairs, checked in order.
    cases = (
        (
            "<html><body><div>simple</div></body></html>",
            "<html><body><p>simple</p></body></html>",
        ),
        (
            '<html><body><div>simple<a href="">link</a>'
            '</div></body></html>',
            '<html><body><p>simple<a href="">link</a></p></body></html>',
        ),
    )

    for markup, expected in cases:
        document = document_fromstring(markup)
        converted = leaf_div_elements_into_paragraphs(document)
        assert tounicode(converted) == to_unicode(expected)
def test_misused_divs_transform():
    """Check that leaf <div> nodes are replaced with <p> nodes.

    The replacement keeps the same content; the element is simply a p
    instead of a div.
    """
    # Case 1: a div holding nothing but text.
    plain_input = "<html><body><div>simple</div></body></html>"
    plain_expected = "<html><body><p>simple</p></body></html>"
    plain_result = leaf_div_elements_into_paragraphs(
        document_fromstring(plain_input)
    )
    assert tounicode(plain_result) == to_unicode(plain_expected)

    # Case 2: a div holding text followed by a link.
    link_input = (
        '<html><body><div>simple<a href="">link</a>'
        '</div></body></html>'
    )
    link_expected = (
        '<html><body><p>simple<a href="">link</a></p></body></html>'
    )
    link_result = leaf_div_elements_into_paragraphs(
        document_fromstring(link_input)
    )
    assert tounicode(link_result) == to_unicode(link_expected)
def dom(self):
    """Parsed lxml tree (Document Object Model) of the given html.

    The tree is taken from ``self._original_document.dom``, passed
    through ``html_cleaner`` and then through
    ``leaf_div_elements_into_paragraphs``, whose result is returned.

    Returns ``None`` if any of those steps raises ``ValueError``.
    """
    try:
        tree = self._original_document.dom
        # The cleaner works in place; its return value is not used.
        html_cleaner(tree)
        result = leaf_div_elements_into_paragraphs(tree)
    except ValueError:
        result = None
    return result
def test_dont_transform_div_with_div():
    """Check that only the child <div> element is turned into a <p>.

    The outer <div>, which contains the child, must stay a <div>.
    """
    markup = (
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )
    document = document_fromstring(markup)

    actual = tounicode(leaf_div_elements_into_paragraphs(document))
    expected = to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )

    assert actual == expected