def test_dont_transform_div_with_div():
    """Only the innermost <div> becomes a <p>; its parent <div> is kept.

    The outer div contains another div, so the expected output leaves it
    (and the text around the child) exactly as it was.
    """
    markup = (
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )
    wanted = (
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )
    tree = document_fromstring(markup)
    rendered = tounicode(leaf_div_elements_into_paragraphs(tree))
    assert rendered == to_unicode(wanted)
def test_misused_divs_transform():
    """Leaf <div> nodes are rewritten as <p> with their content intact.

    Two cases are covered: a div holding only text, and a div holding
    text followed by an inline link.
    """
    # Case 1: text-only leaf div.
    plain_tree = document_fromstring(
        "<html><body><div>simple</div></body></html>")
    plain_output = tounicode(leaf_div_elements_into_paragraphs(plain_tree))
    assert plain_output == to_unicode(
        "<html><body><p>simple</p></body></html>")

    # Case 2: leaf div with text and an anchor; the anchor must survive.
    linked_tree = document_fromstring(
        '<html><body><div>simple<a href="">link</a></div></body></html>')
    linked_output = tounicode(leaf_div_elements_into_paragraphs(linked_tree))
    assert linked_output == to_unicode(
        '<html><body><p>simple<a href="">link</a></p></body></html>')
def test_misused_divs_transform():
    """Check that divs without block children are turned into paragraphs.

    The resulting <p> must carry the same content the <div> had.
    """

    def transformed(markup):
        # Parse, run the div-to-p conversion, and serialize back to text.
        parsed = document_fromstring(markup)
        return tounicode(leaf_div_elements_into_paragraphs(parsed))

    assert transformed(
        "<html><body><div>simple</div></body></html>"
    ) == to_unicode("<html><body><p>simple</p></body></html>")

    assert transformed(
        '<html><body><div>simple<a href="">link</a>' '</div></body></html>'
    ) == to_unicode(
        '<html><body><p>simple<a href="">link</a></p></body></html>'
    )
def test_dont_transform_div_with_div():
    """A div that wraps another div stays a div; only the child converts."""
    nested = document_fromstring(
        "<html><body><div>text<div>child</div>aftertext</div></body></html>"
    )
    converted = leaf_div_elements_into_paragraphs(nested)
    serialized = tounicode(converted)

    # The parent div and its surrounding text are expected to be unchanged.
    expectation = to_unicode(
        "<html><body><div>text<p>child</p>aftertext</div></body></html>"
    )
    assert serialized == expectation
def test_readin_with_base_url():
    """Links should come back absolute when a base url is supplied."""
    snippet = load_snippet('document_absolute_url.html')
    document = OriginalDocument(
        snippet, url="http://blog.mitechie.com/test.html")
    assert to_unicode(document).startswith('<html>')

    # The fixture is expected to expose exactly three links.
    found_links = document.links
    assert len(found_links) == 3

    # Sort every href into "blog" (starts with our base host) or "other".
    tally = defaultdict(int)
    for anchor in found_links:
        on_blog = anchor.get('href').startswith('http://blog.mitechie.com')
        tally['blog' if on_blog else 'other'] += 1

    # Two links resolve against the blog url; the remaining one does not.
    assert tally['blog'] == 2
    assert tally['other'] == 1
def test_readin_with_base_url(self):
    """Supplying a url makes the document's links absolute."""
    html = load_snippet('document_absolute_url.html')
    parsed = OriginalDocument(html, url="http://blog.mitechie.com/test.html")
    self.assertTrue(to_unicode(parsed).startswith('<html>'))

    # Collect the page's links once; three are expected in this fixture.
    page_links = parsed.links
    self.assertEqual(len(page_links), 3)

    # Count hrefs under the blog host versus everything else.
    counts = defaultdict(int)
    for href in (entry.get('href') for entry in page_links):
        if not href.startswith('http://blog.mitechie.com'):
            counts['other'] += 1
            continue
        counts['blog'] += 1

    # Expect two blog-hosted links and a single link pointing elsewhere.
    self.assertEqual(counts['blog'], 2)
    self.assertEqual(counts['other'], 1)
def test_readin_min_document():
    """A minimal html snippet loads and exposes its title."""
    minimal = OriginalDocument(load_snippet('document_min.html'))

    # The serialized document should begin at the <html> root.
    as_text = to_unicode(minimal)
    assert as_text.startswith('<html>')

    assert minimal.title == 'Min Document Title'
def test_readin_min_document(self):
    """Reading the minimal fixture yields html text and the right title."""
    snippet = load_snippet('document_min.html')
    parsed = OriginalDocument(snippet)

    # Serialized output is expected to start with the <html> tag.
    begins_with_html = to_unicode(parsed).startswith('<html>')
    self.assertTrue(begins_with_html)

    self.assertEqual(parsed.title, 'Min Document Title')