    def test_basic(self):
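        # Use MockUrlFetch in place of real network fetches so the multi-page
        # article can be assembled from local regression fixtures.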
        html = load_regression_data('basic-multi-page.html')
        urldict = self._make_basic_urldict()
        fetcher = urlfetch.MockUrlFetch(urldict)
        options = {
                'url': 'http://basic.com/article.html',
                'multipage': True,
                'urlfetch': fetcher
                }
        doc = Document(html, **options)
        res = doc.summary_with_metadata()

        self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
        self.assertIn('Page 3', res.html, 'Should find the page 3 heading')

        expected_html = load_regression_data('basic-multi-page-expected.html')
        diff_html = htmldiff(expected_html, res.html)
        diff_doc = document_fromstring(diff_html)

        insertions = diff_doc.xpath('//ins')
        deletions = diff_doc.xpath('//del')

        if insertions:
            for i in insertions:
                print('unexpected insertion: %s' % i.xpath('string()'))
            self.fail('readability result does not match expected')

        if deletions:
            for i in deletions:
                print('unexpected deletion: %s' % i.xpath('string()'))
            self.fail('readability result does not match expected')

    def setUp(self):
        super(TestIsSuspectedDuplicate, self).setUp()
        html = load_regression_data('duplicate-page-article.html')
        self._article = r.fragment_fromstring(html)

    def test_duplicate(self):
        html = load_regression_data('duplicate-page-duplicate.html')
        page = r.fragment_fromstring(html)
        self.assertTrue(r.is_suspected_duplicate(self._article, page))

    def _test_page(self, url, html_path, expected):
        # Shared helper: parse a regression page and check which (if any)
        # next-page URL the pagination detection finds.
        html = load_regression_data(html_path)
        doc = r.parse(html, url)
        parsed_urls = {url}
        actual = r.find_next_page_url(parsed_urls, url, doc)
        self.assertEqual(expected, actual)
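
    # A minimal sketch of how a concrete test might drive _test_page; the file
    # name and URLs below are hypothetical, not part of the regression data.
    #
    # def test_finds_next_page_link(self):
    #     self._test_page(
    #         'http://example.com/article.html',
    #         'example-article-with-next-link.html',
    #         'http://example.com/article.html?page=2')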