def test_basic(self):
    """Check multi-page extraction against the stored regression output.

    Builds a Document for 'basic-multi-page.html' with the 'multipage'
    option enabled and a MockUrlFetch built from the basic url dict
    (presumably so follow-up pages are served without network access --
    confirm against urlfetch.MockUrlFetch).  The summary must contain the
    'Page 2' and 'Page 3' headings and must match
    'basic-multi-page-expected.html' with no differences.
    """
    html = load_regression_data('basic-multi-page.html')
    urldict = self._make_basic_urldict()
    fetcher = urlfetch.MockUrlFetch(urldict)
    options = {
        'url': 'http://basic.com/article.html',
        'multipage': True,
        'urlfetch': fetcher,
    }
    doc = Document(html, **options)
    res = doc.summary_with_metadata()

    self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
    self.assertIn('Page 3', res.html, 'Should find the page 3 heading')

    expected_html = load_regression_data('basic-multi-page-expected.html')
    diff_html = htmldiff(expected_html, res.html)
    diff_doc = document_fromstring(diff_html)

    # Gather insertions AND deletions before failing: failing on the first
    # non-empty category would hide the other one, and putting the details
    # in the failure message keeps them attached to the test report rather
    # than scattered on stdout.
    problems = [
        'unexpected %s: %s' % (label, node.xpath('string()'))
        for label, query in (('insertion', '//ins'), ('deletion', '//del'))
        for node in diff_doc.xpath(query)
    ]
    if problems:
        self.fail(
            'readability result does not match expected\n'
            + '\n'.join(problems))
def test_si_sample_full_summary(self):
    """We should parse the doc and get a full summary with confidence.

    Loads the 'si-game.sample.html' sample, summarizes it without the
    enclosing html tag, and checks that the result exposes the html,
    confidence, title and short_title attributes, that the html starts
    with the expected wrapper divs, and that confidence is above 50.
    """
    sample = load_sample('si-game.sample.html')
    doc = Document(
        sample,
        url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.summary_with_metadata(enclose_with_html_tag=False)

    # Each message names the attribute actually being checked so that a
    # failure points at the right field.
    self.assertTrue(hasattr(res, 'html'),
                    'res should have an html attrib')
    self.assertTrue(hasattr(res, 'confidence'),
                    'res should have a confidence attrib')
    self.assertTrue(hasattr(res, 'title'),
                    'res should have a title attrib')
    self.assertTrue(hasattr(res, 'short_title'),
                    'res should have a short_title attrib')

    # With enclose_with_html_tag=False the output is expected to begin
    # directly with the wrapper divs (17 characters compared).
    self.assertEqual('<div><div class="', res.html[0:17])
    self.assertGreater(
        res.confidence, 50,
        'The confidence score should be larger than 50: '
        + str(res.confidence))