def test_doc_no_scripts_styles(self):
    """Step #1: no script, style or link tags remain in the readable doc."""
    readable = Article(load_snippet('document_scripts.html'))._readable
    # Each of these lookups must come back empty.
    for xpath in (".//script", ".//style", ".//link"):
        self.assertEqual(readable.findall(xpath), [])
def test_unlikely_hits(self):
    """Verify we wipe out things from our unlikely list.

    Any element in the readable tree that still carries one of the
    ``must_not_appear`` names as a class or id must also carry one of the
    ``want_to_appear`` names in that same attribute.
    """
    doc = Article(load_snippet('test_readable_unlikely.html'))
    readable = doc._readable
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot', 'header', 'menu',
        'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
        'agegate', 'pagination', 'pager', 'popup', 'tweet', 'twitter',
        'imgBlogpostPermalink',
    ]
    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for name in must_not_appear:
        # An element matched by class may only survive when it also has
        # one of the wanted classes.
        for node in readable.find_class(name):
            classes = node.get('class').split()
            self.assertTrue(
                any(cls in want_to_appear for cls in classes),
                "unlikely class %r survived on %r" % (name, classes))

        # get_element_by_id hands back the supplied default (False) when
        # no element has this id.
        by_id = readable.get_element_by_id(name, False)
        if by_id is not False:
            # Check the element that was found by id, not a leftover
            # variable from the class loop above.
            ids = by_id.get('id').split()
            self.assertTrue(
                any(value in want_to_appear for value in ids),
                "unlikely id %r survived on %r" % (name, ids))
def test_unlikely_hits(self):
    """Verify we wipe out things from our unlikely list.

    Any element in the readable tree that still carries one of the
    ``must_not_appear`` names as a class or id must also carry one of the
    ``want_to_appear`` names in that same attribute.
    """
    doc = Article(load_snippet('test_readable_unlikely.html'))
    readable = doc._readable
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot', 'header', 'menu',
        'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
        'agegate', 'pagination', 'pager', 'popup', 'tweet', 'twitter',
        'imgBlogpostPermalink',
    ]
    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for name in must_not_appear:
        # An element matched by class may only survive when it also has
        # one of the wanted classes.
        for node in readable.find_class(name):
            classes = node.get('class').split()
            self.assertTrue(
                any(cls in want_to_appear for cls in classes),
                "unlikely class %r survived on %r" % (name, classes))

        # get_element_by_id hands back the supplied default (False) when
        # no element has this id.
        by_id = readable.get_element_by_id(name, False)
        if by_id is not False:
            # Check the element that was found by id, not a leftover
            # variable from the class loop above.
            ids = by_id.get('id').split()
            self.assertTrue(
                any(value in want_to_appear for value in ids),
                "unlikely id %r survived on %r" % (name, ids))
def test_bare_content(self):
    """Pure content without html tags should still be handled.

    The readable node built for such a document is expected to be a
    ``div`` whose id is ``readabilityBody``.
    """
    readable = Article(load_snippet('document_only_content.html'))._readable
    self.assertEqual(readable.tag, 'div')
    self.assertEqual(readable.get('id'), 'readabilityBody')
def test_find_body_exists(self):
    """A document that has a body yields the readable wrapper node.

    The readable node is expected to be a ``div`` with the id
    ``readabilityBody``.
    """
    article = Article(load_snippet('document_min.html'))
    root = article._readable
    self.assertEqual(root.tag, 'div')
    self.assertEqual(root.get('id'), 'readabilityBody')
def test_body_doesnt_exist(self):
    """A document without a body still yields the readable wrapper node.

    The readable node is expected to be a ``div`` with the id
    ``readabilityBody``.
    """
    snippet = load_snippet('document_no_body.html')
    wrapper = Article(snippet)._readable
    self.assertEqual(wrapper.tag, 'div')
    self.assertEqual(wrapper.get('id'), 'readabilityBody')
def test_readin_with_base_url(self):
    """Passing a url should update links to be absolute links."""
    snippet = load_snippet('document_absolute_url.html')
    doc = OriginalDocument(snippet, url="http://blog.mitechie.com/test.html")
    self.assertTrue(str(doc).startswith(u'<html>'))

    # The page is expected to hold three links in total.
    links = doc.links
    self.assertEqual(len(links), 3)

    # Two of them should sit under the blog url we passed in, and the
    # remaining one should point elsewhere.
    on_blog = [
        link for link in links
        if link.get('href').startswith('http://blog.mitechie.com')
    ]
    self.assertEqual(len(on_blog), 2)
    self.assertEqual(len(links) - len(on_blog), 1)
def test_readin_with_base_url(self):
    """Passing a url should update links to be absolute links."""
    base_url = "http://blog.mitechie.com/test.html"
    doc = OriginalDocument(
        load_snippet('document_absolute_url.html'), url=base_url)
    self.assertTrue(str(doc).startswith(u'<html>'))

    # The page is expected to hold three links in total.
    links = doc.links
    self.assertEqual(len(links), 3)

    # Tally links under the blog url versus everything else; we expect
    # two of the former and one of the latter.
    tally = defaultdict(int)
    for anchor in links:
        is_blog = anchor.get('href').startswith('http://blog.mitechie.com')
        tally['blog' if is_blog else 'other'] += 1

    self.assertEqual(tally['blog'], 2)
    self.assertEqual(tally['other'], 1)
def test_readin_min_document(self):
    """Verify we can read in a min html document."""
    doc = OriginalDocument(load_snippet('document_min.html'))
    rendered = str(doc)
    # The string form should open with the html tag and the title should
    # have been picked up from the snippet.
    self.assertTrue(rendered.startswith(u'<html>'))
    self.assertEqual(doc.title, 'Min Document Title')
def test_several_links(self):
    """This doc has 3 links with the majority of content."""
    readable = Article(load_snippet('document_absolute_url.html'))._readable
    density = get_link_density(readable)
    # Compared to three decimal places.
    self.assertAlmostEqual(density, 0.349, places=3)
def test_small_doc_no_links(self):
    """The min document snippet should report a link density of zero."""
    doc = Article(load_snippet('document_min.html'))
    # Use the TestCase assertion rather than a bare ``assert`` so the
    # check still runs under ``python -O`` and matches the sibling tests.
    self.assertEqual(
        0, get_link_density(doc._readable), "Still no link density")
def test_several_links(self):
    """This doc has 3 links with the majority of content."""
    article = Article(load_snippet('document_absolute_url.html'))
    expected_density = 0.349
    # Compared to three decimal places.
    self.assertAlmostEqual(
        get_link_density(article._readable), expected_density, places=3)
def test_load_doc(self):
    """We get back an element tree from our original doc."""
    readable = Article(load_snippet('document_min.html'))._readable
    # The readable document currently comes back as a div tag by default.
    self.assertEqual(readable.tag, 'div')
def test_no_br_allowed(self):
    """We convert all <br/> tags to <p> tags.

    This test only verifies that no ``br`` element is left in the parsed
    html of the min document snippet.
    """
    parsed = OriginalDocument(load_snippet('document_min.html')).html
    leftover_br = parsed.find('.//br')
    self.assertIsNone(leftover_br)