예제 #1
0
 def test_version_error(self, popen):
     """get_version() must raise TesseractError when wait() returns 2."""
     # The mocked process reports 2 from wait(); the test expects that
     # value to come back as the exception's status.
     popen.return_value = self.stdout
     self.stdout.wait.return_value = 2

     with self.assertRaises(tesseract.TesseractError) as raised:
         tesseract.get_version()

     error = raised.exception
     self.assertEqual(error.status, 2)
     self.assertEqual(error.message, self.message)
예제 #2
0
 def test_version_error_nan(self, popen):
     """A non-numeric version ("A.B.C") must raise TesseractError."""
     # Swap the numeric version of the canned output for letters and hand
     # it to the mocked pipe as bytes.
     bogus_output = self.message.replace("tesseract 4.0.0",
                                         "tesseract A.B.C").encode()
     popen.return_value = self.stdout
     self.stdout.stdout.read.return_value = bogus_output

     with self.assertRaises(tesseract.TesseractError) as raised:
         tesseract.get_version()

     error = raised.exception
     self.assertEqual(error.status, 0)
     self.assertIn("Unable to parse Tesseract version (not a number): ",
                   error.message)
예제 #3
0
 def test_version_error_splitting(self, popen):
     """A bare "tesseract 3" banner must raise TesseractError (status 0)."""
     popen.return_value = self.stdout
     tesseract.g_version = None  # drop cached version

     # Canned output whose version is reduced to the single number "3".
     truncated_output = self.message.replace(b"tesseract 4.0.0",
                                             b"tesseract 3")
     self.stdout.stdout.read.return_value = truncated_output

     with self.assertRaises(tesseract.TesseractError) as raised:
         tesseract.get_version()

     error = raised.exception
     self.assertEqual(error.status, 0)
     self.assertIn("Unable to parse Tesseract version (spliting failed): ",
                   error.message)
예제 #4
0
    def test_version_cache(self, popen):
        """
        Make sure Tesseract is not called every time we need the version.

        We need the version *often* in the code, and calling Tesseract
        every time wouldn't be wise.
        """
        tesseract.g_version = None  # drop cached version

        self.stdout.stdout.read.return_value = self.message
        popen.return_value = self.stdout
        self.assertSequenceEqual(tesseract.get_version(), (4, 0, 0))

        # Feed unparsable output: a second spawn could not yield (4, 0, 0)
        # from it. Bytes are used because that is what a pipe read() returns.
        self.stdout.stdout.read.return_value = b"garbage"
        popen.return_value = self.stdout
        self.assertSequenceEqual(tesseract.get_version(), (4, 0, 0))

        # State the caching contract explicitly: the two get_version() calls
        # above must have spawned the mocked process only once.
        self.assertEqual(popen.call_count, 1)
예제 #5
0
class TestContext(unittest.TestCase):
    """
    These tests make sure the requirements for the tests are met.
    """

    def test_available(self):
        """tesseract.is_available() must report True."""
        self.assertTrue(tesseract.is_available(),
                        "Tesseract not found. Is it installed ?")

    @unittest.skipIf(tesseract.get_version() != (3, 2, 1),
                     "This test only works with Tesseract 3.02.1")
    def test_version(self):
        """tesseract.get_version() must be (3, 2, 1)."""
        # NOTE(review): the skip condition above already requires this
        # equality, so the assertion only runs when it is expected to hold.
        self.assertEqual(tesseract.get_version(), (3, 2, 1),
                         ("Tesseract does not have the expected version"
                          " (3.02.1) ! Tests will fail !"))

    def test_langs(self):
        """The "eng", "fra" and "jpn" languages must all be available."""
        langs = tesseract.get_available_languages()
        # assertIn shows the reported languages when a check fails, whereas
        # assertTrue("x" in langs) would only say "False is not true".
        self.assertIn("eng", langs,
                      ("English training does not appear to be installed."
                       " (required for the tests)"))
        self.assertIn("fra", langs,
                      ("French training does not appear to be installed."
                       " (required for the tests)"))
        self.assertIn("jpn", langs,
                      ("Japanese training does not appear to be installed."
                       " (required for the tests)"))
예제 #6
0
 def test_version_windows(self, popen):
     """A "v4.0.0.20181030" version banner must parse as (4, 0, 0)."""
     popen.return_value = self.stdout
     tesseract.g_version = None  # drop cached version

     # Canned output with a "v" prefix and a fourth version component.
     windows_output = self.message.replace(
         b"tesseract 4.0.0", b"tesseract v4.0.0.20181030")
     self.stdout.stdout.read.return_value = windows_output

     detected = tesseract.get_version()
     self.assertSequenceEqual(detected, (4, 0, 0))
예제 #7
0
 def test_version(self):
     """The reported Tesseract version must be one of the accepted ones."""
     accepted_versions = (
         (3, 2, 1),
         (3, 2, 2),
         (3, 3, 0),
     )
     # assertIn includes the actual version in the failure output, whereas
     # assertTrue(... in ...) would only say "False is not true".
     self.assertIn(tesseract.get_version(), accepted_versions,
                   ("Tesseract does not have the expected version"
                    " (3.3.0) ! Some tests will be skipped !"))
예제 #8
0
class TestTxt(unittest.TestCase):
    """
    These tests make sure the "usual" OCR works fine. (the one generating
    a .txt file)
    """

    # Tesseract versions for which the European and French comparisons are
    # run; any other version skips those two tests.
    _REFERENCE_VERSIONS = (
        (3, 2, 1),
        (3, 2, 2),
        (3, 3, 0),
    )

    def __test_txt(self, image_file, expected_output_file, lang='eng'):
        """
        Run OCR on an image and compare the text with a reference file.

        Arguments:
            image_file --- image file name, relative to "tests/data/"
            expected_output_file --- reference text file name, relative to
                "tests/tesseract/" (read as UTF-8, then stripped)
            lang --- language passed to tesseract.image_to_string()
        """
        image_file = "tests/data/" + image_file
        expected_output_file = "tests/tesseract/" + expected_output_file

        # Read the reference in one go instead of concatenating it line by
        # line; the resulting text is the same.
        with codecs.open(expected_output_file, 'r', encoding='utf-8') \
                as file_descriptor:
            expected_output = file_descriptor.read().strip()

        output = tesseract.image_to_string(Image.open(image_file), lang=lang)

        self.assertEqual(output, expected_output)

    def test_basic(self):
        """OCR of test.png must match test.txt."""
        self.__test_txt('test.png', 'test.txt')

    @unittest.skipIf(tesseract.get_version() not in _REFERENCE_VERSIONS,
                     "This test only works with Tesseract 3.02.1, 3.02.2"
                     " or 3.03")
    def test_european(self):
        """OCR of test-european.jpg must match test-european.txt."""
        self.__test_txt('test-european.jpg', 'test-european.txt')

    @unittest.skipIf(tesseract.get_version() not in _REFERENCE_VERSIONS,
                     "This test only works with Tesseract 3.02.1, 3.02.2"
                     " or 3.03")
    def test_french(self):
        """OCR of test-french.jpg with lang 'fra' must match its reference."""
        self.__test_txt('test-french.jpg', 'test-french.txt', 'fra')

    def test_japanese(self):
        """OCR of test-japanese.jpg with lang 'jpn' must match its reference."""
        self.__test_txt('test-japanese.jpg', 'test-japanese.txt', 'jpn')
예제 #9
0
 def test_version(self):
     """The reported Tesseract version must be one of the accepted ones."""
     accepted_versions = (
         (3, 2, 1),
         (3, 2, 2),
         (3, 3, 0),
         (3, 4, 0),
     )
     # assertIn includes the actual version in the failure output, whereas
     # assertTrue(... in ...) would only say "False is not true".
     self.assertIn(tesseract.get_version(), accepted_versions,
                   ("Tesseract does not have the expected version"
                    " (3.4.0) ! Some tests will be skipped !"))
예제 #10
0
 def test_version(self):
     """The reported Tesseract version must be one of the accepted ones."""
     accepted_versions = (
         (3, 2, 1),
         (3, 2, 2),
         (3, 3, 0),
         (3, 4, 0),
         (3, 4, 1),
         (3, 5, 0),
     )
     # assertIn includes the actual version in the failure output, whereas
     # assertTrue(... in ...) would only say "False is not true".
     self.assertIn(tesseract.get_version(), accepted_versions,
                   ("Tesseract does not have the expected version"))
예제 #11
0
    def test_version_tesseract4(self, popen):
        """Expect (4, 0, 0) and check how the process was spawned."""
        popen.return_value = self.stdout
        tesseract.g_version = None  # drop cached version

        detected = tesseract.get_version()
        self.assertSequenceEqual(detected, (4, 0, 0))

        # stderr must be explicitly ignored when calling 'tesseract -v':
        # the single spawn must not receive any 'stderr' keyword argument.
        # See https://gitlab.gnome.org/World/OpenPaperwork/pyocr/-/issues/118
        popen.assert_called_once()
        _args, kwargs = popen.call_args  # positional arguments are unused
        self.assertNotIn('stderr', kwargs)
예제 #12
0
 def test_version(self):
     """The reported Tesseract version must be one of the accepted ones."""
     accepted_versions = (
         (3, 2, 1),
         (3, 2, 2),
         (3, 3, 0),
         (3, 4, 0),
         (3, 4, 1),
         (3, 5, 0),
     )
     # assertIn includes the actual version in the failure output, whereas
     # assertTrue(... in ...) would only say "False is not true".
     self.assertIn(
         tesseract.get_version(),
         accepted_versions,
         ("Tesseract does not have the expected version")
     )
예제 #13
0
 def test_version_windows(self, popen):
     """A "v4.0.0.20181030" version banner must parse as (4, 0, 0)."""
     # Canned output with a "v" prefix and a fourth version component,
     # handed to the mocked pipe as bytes.
     windows_output = self.message.replace(
         "tesseract 4.0.0", "tesseract v4.0.0.20181030")
     popen.return_value = self.stdout
     self.stdout.stdout.read.return_value = windows_output.encode()

     detected = tesseract.get_version()
     self.assertSequenceEqual(detected, (4, 0, 0))
예제 #14
0
 def test_version_tesseract3_no_minor(self, popen):
     """A two-part "tesseract 3.0" banner must parse as (3, 0, 0)."""
     short_output = self.message.replace("tesseract 4.0.0", "tesseract 3.0")
     popen.return_value = self.stdout
     self.stdout.stdout.read.return_value = short_output.encode()

     detected = tesseract.get_version()
     self.assertSequenceEqual(detected, (3, 0, 0))
예제 #15
0
 def test_version_tesseract4(self, popen):
     """The unmodified mocked output must yield version (4, 0, 0)."""
     popen.return_value = self.stdout
     detected = tesseract.get_version()
     self.assertSequenceEqual(detected, (4, 0, 0))
예제 #16
0
class TestCharBox(unittest.TestCase):
    """
    These tests make sure that Tesseract box handling works fine.
    """
    def setUp(self):
        """Create the CharBoxBuilder shared by every test of this class."""
        self.builder = tesseract.CharBoxBuilder()

    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """
        Run OCR with the box builder and compare with a reference box file.

        Arguments:
            image_file --- image file name, relative to "tests/data/"
            expected_box_file --- reference box file name, relative to
                "tests/tesseract/" (opened as UTF-8)
            lang --- language passed to tesseract.image_to_string()
        """
        image_file = "tests/data/" + image_file
        expected_box_file = "tests/tesseract/" + expected_box_file

        with codecs.open(expected_box_file, 'r', encoding='utf-8') \
                as file_descriptor:
            expected_boxes = self.builder.read_file(file_descriptor)
        expected_boxes.sort()

        boxes = tesseract.image_to_string(Image.open(image_file), lang=lang,
                                          builder=self.builder)
        boxes.sort()

        # Both lists are sorted so that they can be compared pairwise.
        self.assertEqual(len(boxes), len(expected_boxes))
        for box, expected_box in zip(boxes, expected_boxes):
            self.assertEqual(box, expected_box)

    def test_basic(self):
        """Boxes of test.png must match test.box."""
        self.__test_txt('test.png', 'test.box')

    def test_european(self):
        """Boxes of test-european.jpg must match test-european.box."""
        self.__test_txt('test-european.jpg', 'test-european.box')

    def test_french(self):
        """Boxes of test-french.jpg (lang 'fra') must match its reference."""
        self.__test_txt('test-french.jpg', 'test-french.box', 'fra')

    @unittest.skipIf(tesseract.get_version() not in (
        (3, 2, 1),
        (3, 2, 2),
        (3, 3, 0),
    ), "This test requires Tesseract 3.02.1, 3.02.2 or 3.03")
    def test_japanese(self):
        """Boxes of test-japanese.jpg (lang 'jpn') must match its reference."""
        self.__test_txt('test-japanese.jpg', 'test-japanese.box', 'jpn')

    def test_write_read(self):
        """Boxes written with write_file() must be read back unchanged."""
        original_boxes = tesseract.image_to_string(
            Image.open("tests/data/test.png"), builder=self.builder)
        self.assertGreater(len(original_boxes), 0)

        (file_descriptor, tmp_path) = tempfile.mkstemp()
        try:
            # we must open the file with codecs.open() for utf-8 support
            os.close(file_descriptor)

            with codecs.open(tmp_path, 'w', encoding='utf-8') as fdescriptor:
                self.builder.write_file(fdescriptor, original_boxes)

            with codecs.open(tmp_path, 'r', encoding='utf-8') as fdescriptor:
                new_boxes = self.builder.read_file(fdescriptor)

            # The length check runs first, so the pairwise loop below never
            # hides a missing or extra box.
            self.assertEqual(len(new_boxes), len(original_boxes))
            for new_box, original_box in zip(new_boxes, original_boxes):
                self.assertEqual(new_box, original_box)
        finally:
            # Always delete the temporary file, even if an assertion failed.
            os.remove(tmp_path)
예제 #17
0
 def test_version_tesseract3_no_minor(self, popen):
     """A two-part "tesseract 3.0" banner must parse as (3, 0, 0)."""
     popen.return_value = self.stdout
     tesseract.g_version = None  # drop cached version

     short_output = self.message.replace(b"tesseract 4.0.0",
                                         b"tesseract 3.0")
     self.stdout.stdout.read.return_value = short_output

     detected = tesseract.get_version()
     self.assertSequenceEqual(detected, (3, 0, 0))
예제 #18
0
 def test_version(self):
     """tesseract.get_version() must be exactly (3, 2, 1)."""
     expected_version = (3, 2, 1)
     failure_hint = ("Tesseract does not have the expected version"
                     " (3.02.1) ! Tests will fail !")
     self.assertEqual(tesseract.get_version(), expected_version,
                      failure_hint)
예제 #19
0
 def test_version(self):
     """tesseract.get_version() must be exactly (3, 2, 1)."""
     expected_version = (3, 2, 1)
     failure_hint = ("Tesseract does not have the expected version"
                     " (3.02.1) ! Tests will fail !")
     self.assertEqual(tesseract.get_version(), expected_version,
                      failure_hint)