Exemplo n.º 1
0
class TestPageOCR(TestCase):

    """Check the OCR helpers of a ProofreadPage against reference text."""

    family = 'wikisource'
    code = 'en'

    cached = True

    # Title of the page under test and, for each OCR backend key, the
    # (error flag, text) pair the tests below compare results against.
    data = {
        'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
        'hocr': (
            False,
            'ENTERED, according to Act of Congress, in the '
            'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
            'of the Librarian of Congress, at '
            'Washington.\n\n',
        ),
        'ocr': (
            False,
            'lam-mam, according to Act of Congress, in the '
            'year 157-2,\nBY D. APPLEION Av CO.,\nIn the '
            'Of\ufb01ce or the Librarian of '
            'Congress, at Washington.\n\n',
        ),
        'googleOCR': (
            False,
            'ENTERED, according to Act of Congress, in '
            'the year 1572,\nBY D. APPLETON & CO.\n'
            'In the Office of the Librarian of '
            'Congress, at Washington.\n4 334\n',
        ),
    }

    def setUp(self):
        """Build the page under test, then run the parent setUp."""
        self.page = ProofreadPage(self.get_site(), self.data['title'])
        super(TestPageOCR, self).setUp()

    def _check_against_reference(self, result, key):
        """Assert that an (error, text) result equals ``self.data[key]``."""
        got_error, got_text = result
        want_error, want_text = self.data[key]
        self.assertEqual(got_error, want_error)
        self.assertEqual(got_text, want_text)

    def test_ocr_exceptions(self):
        """page.ocr() must raise TypeError for the 'dummy' ocr_tool."""
        with self.assertRaises(TypeError):
            self.page.ocr(ocr_tool='dummy')

    def test_do_hocr(self):
        """page._do_hocr() must return the 'hocr' reference pair."""
        self._check_against_reference(self.page._do_hocr(), 'hocr')

    def test_do_ocr_phetools_raw_request(self):
        """Fetch the phetools OCR URL directly and expect status 200."""
        request_url = (
            'https://tools.wmflabs.org/phetools/ocr.php?cmd=ocr'
            '&url=https://upload.wikimedia.org/wikipedia/commons/'
            'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
            'page10-1024px-Popular_Science_Monthly_Volume_1.djvu.jpg'
            '&lang=en&user=None'
        )
        self.assertEqual(http.fetch(request_url).status, 200)

    def test_do_ocr_phetools(self):
        """page._do_ocr(ocr_tool='phetools') must return the 'ocr' pair."""
        self._check_against_reference(
            self.page._do_ocr(ocr_tool='phetools'), 'ocr')

    def test_do_ocr_googleocr(self):
        """page._do_ocr(ocr_tool='googleOCR') must return its pair."""
        self._check_against_reference(
            self.page._do_ocr(ocr_tool='googleOCR'), 'googleOCR')

    def test_ocr_googleocr(self):
        """page.ocr(ocr_tool='googleOCR') must return the reference text."""
        ocr_text = self.page.ocr(ocr_tool='googleOCR')
        # Only the text half of the reference pair matters here.
        expected_text = self.data['googleOCR'][1]
        self.assertEqual(ocr_text, expected_text)
Exemplo n.º 2
0
class TestPageOCR(BS4TestCase):
    """Test page ocr functions.

    OCR results are compared with the reference texts in ``data`` by
    similarity ratio instead of strict equality, and a test is skipped
    when the OCR call itself reports an error.
    """

    family = 'wikisource'
    code = 'en'

    cached = True

    # Title of the page under test and, for each OCR backend key, the
    # (error flag, text) reference pair.
    data = {
        'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
        'hocr': (False, 'ENTERED, according to Act of Congress, in the '
                 'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
                 'of the Librarian of Congress, at '
                 'Washington.\n\n'),
        'ocr': (False, 'EsTEnen, according to Act of Congress, in the '
                'year 1872,\nBy D. APPLETON & CO.,\nIn the '
                'Office of the Librarian of Congress, at '
                'Washington.\n\u000c'),
        'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
                      'the year 1572,\nBY D. APPLETON & CO.\n'
                      'In the Office of the Librarian of '
                      'Congress, at Washington.\n4 334\n'),
    }

    def setUp(self):
        """Create the page under test, then run the parent setUp."""
        site = self.get_site()
        title = self.data['title']
        self.page = ProofreadPage(site, title)
        super().setUp()

    def _assert_similar(self, text, ref_text, threshold=0.9):
        """Assert that *text* resembles *ref_text* closely enough.

        :param text: text returned by the OCR call
        :param ref_text: reference text from ``data``
        :param threshold: value the difflib.SequenceMatcher ratio must
            exceed for the assertion to pass
        """
        ratio = difflib.SequenceMatcher(None, text, ref_text).ratio()
        self.assertGreater(ratio, threshold)

    def test_ocr_exceptions(self):
        """Test page.ocr() exceptions."""
        self.assertRaises(TypeError, self.page.ocr, ocr_tool='dummy')

    def test_do_hocr(self):
        """Test page._do_hocr()."""
        error, text = self.page._do_hocr()
        if error:
            # On error the second item is passed on as the skip reason.
            self.skipTest(text)
        ref_error, ref_text = self.data['hocr']
        self.assertEqual(error, ref_error)
        self._assert_similar(text, ref_text)

    def test_do_ocr_phetools(self):
        """Test page._do_ocr(ocr_tool='phetools')."""
        error, text = self.page._do_ocr(ocr_tool='phetools')
        ref_error, ref_text = self.data['ocr']
        if error:
            self.skipTest(text)
        self.assertEqual(error, ref_error)
        self._assert_similar(text, ref_text)

    def test_do_ocr_googleocr(self):
        """Test page._do_ocr(ocr_tool='googleOCR')."""
        error, text = self.page._do_ocr(ocr_tool='googleOCR')
        if error:
            self.skipTest(text)
        ref_error, ref_text = self.data['googleOCR']
        self.assertEqual(error, ref_error)
        self._assert_similar(text, ref_text)

    def test_ocr_googleocr(self):
        """Test page.ocr(ocr_tool='googleOCR')."""
        try:
            text = self.page.ocr(ocr_tool='googleOCR')
        except ValueError as exc:
            # A ValueError from page.ocr() is tolerated (presumably the OCR
            # backend reported an error -- confirm against
            # ProofreadPage.ocr). Skip, as the _do_* tests do on error,
            # rather than reporting a pass with nothing verified. Any
            # other exception propagates and is reported as a test error.
            self.skipTest(str(exc))
        else:
            _, ref_text = self.data['googleOCR']
            self._assert_similar(text, ref_text)