def test_line_error(self, popen):
    """Check that a failed cuneiform run is reported as ``CuneiformError``.

    ``popen`` is a stand-in supplied by the caller (presumably a patched
    process constructor -- the decorator is not visible here).  It is made
    to return ``self.stdout``, configured to emit an error banner and to
    finish with exit status 1.  The raised exception must carry that status
    and the complete output text.
    """
    message = (
        "Cuneiform for Linux 1.1.0\n"
        "Magick: Improper image header (example.png) reported by "
        "coders/png.c:2932 (ReadPNGImage)\n"
    )

    fake_process = self.stdout
    fake_process.stdout.read.return_value = message.encode()
    fake_process.wait.return_value = 1
    popen.return_value = fake_process

    with self.assertRaises(cuneiform.CuneiformError) as raised:
        cuneiform.image_to_string(self.image, builder=self.builder)

    error = raised.exception
    self.assertEqual(error.status, 1)
    self.assertEqual(error.message, message)
def test_write_read(self):
    """Round-trip recognised boxes through the builder's file format.

    The boxes produced for ``tests/input/specific/test.png`` are written to
    a temporary file with ``self.builder.write_file`` and loaded back with
    ``self.builder.read_file``; the reloaded sequence must match the
    original element by element.  The temporary file is always removed.
    """
    image_path = os.path.join("tests", "input", "specific", "test.png")
    original_boxes = cuneiform.image_to_string(
        Image.open(image_path),
        builder=self.builder,
    )
    self.assertTrue(len(original_boxes) > 0)

    raw_handle, tmp_path = tempfile.mkstemp()
    try:
        # Only the path is needed: the file is reopened through
        # codecs.open() to get utf-8 support, so release the raw handle.
        os.close(raw_handle)

        with codecs.open(tmp_path, 'w', encoding='utf-8') as out_file:
            self.builder.write_file(out_file, original_boxes)

        with codecs.open(tmp_path, 'r', encoding='utf-8') as in_file:
            new_boxes = self.builder.read_file(in_file)

        self.assertEqual(len(new_boxes), len(original_boxes))
        for restored, original in zip(new_boxes, original_boxes):
            self.assertEqual(restored, original)
    finally:
        os.remove(tmp_path)
def __test_txt(self, image_file, expected_box_file, lang='eng'):
    """Compare the boxes recognised in an image with a reference box file.

    :param image_file: file name below ``tests/input/specific``.
    :param expected_box_file: file name below
        ``tests/output/specific/cuneiform``; parsed with
        ``self.builder.read_file``.
    :param lang: language passed on to ``cuneiform.image_to_string``.
    """
    image_file = os.path.join("tests", "input", "specific", image_file)
    expected_box_file = os.path.join(
        "tests", "output", "specific", "cuneiform", expected_box_file
    )

    with codecs.open(expected_box_file, 'r', encoding='utf-8') as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = cuneiform.image_to_string(
        Image.open(image_file), lang=lang, builder=self.builder
    )
    boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))

    # Box contents must be text: ``unicode`` where that name exists
    # (Python 2.7), ``str`` otherwise (Python 3.x).
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    for actual, expected in zip(boxes, expected_boxes):
        self.assertEqual(type(expected.content), text_type)
        self.assertEqual(type(actual.content), text_type)
        self.assertEqual(actual, expected)
def __test_txt(self, image_file, expected_box_file, lang='eng'):
    """Compare the boxes recognised in an image with a reference box file.

    :param image_file: file name appended to ``tests/data/``.
    :param expected_box_file: file name appended to ``tests/cuneiform/``;
        parsed with ``self.builder.read_file``.
    :param lang: language passed on to ``cuneiform.image_to_string``.
    """
    image_file = "tests/data/" + image_file
    expected_box_file = "tests/cuneiform/" + expected_box_file

    with codecs.open(expected_box_file, 'r', encoding='utf-8') as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = cuneiform.image_to_string(
        Image.open(image_file), lang=lang, builder=self.builder
    )
    boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))

    # Box contents must be text: ``unicode`` where that name exists
    # (Python 2.7), ``str`` otherwise (Python 3.x).
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    for actual, expected in zip(boxes, expected_boxes):
        self.assertEqual(type(expected.content), text_type)
        self.assertEqual(type(actual.content), text_type)
        self.assertEqual(actual, expected)
def test_text(self, popen, copen, temp_file):
    """Check plain-text recognition with an explicitly supplied builder.

    ``popen``, ``copen`` and ``temp_file`` are stand-ins injected by the
    caller (presumably patch decorators -- they are not visible here) and
    are wired to the fixtures kept on ``self``.  The result must equal the
    stripped "text" fixture, and cuneiform must have been launched exactly
    once with the ``text`` output format.
    """
    popen.return_value = self.stdout
    copen.return_value = self.text_file
    temp_file.return_value = self.enter

    output = cuneiform.image_to_string(self.image, builder=self.builder)

    expected_text = self._get_file_content("text").strip()
    self.assertEqual(output, expected_text)

    expected_command = [
        "cuneiform", "-f", "text", "-o", self.tmp_filename, "-",
    ]
    pipes = {
        "stdin": subprocess.PIPE,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
    }
    popen.assert_called_once_with(expected_command, **pipes)
def test_line(self, popen, copen, temp_file):
    """Check line-box recognition with the builder stored on ``self``.

    ``popen``, ``copen`` and ``temp_file`` are stand-ins injected by the
    caller (presumably patch decorators -- they are not visible here).
    Cuneiform must have been launched exactly once with the ``hocr``
    output format, and every returned item must be a ``builders.LineBox``.
    """
    popen.return_value = self.stdout
    copen.return_value = self.text_file
    temp_file.return_value = self.enter

    output = cuneiform.image_to_string(self.image, builder=self.builder)

    expected_command = [
        "cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-",
    ]
    pipes = {
        "stdin": subprocess.PIPE,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
    }
    popen.assert_called_once_with(expected_command, **pipes)

    for line_box in output:
        self.assertIsInstance(line_box, builders.LineBox)
def test_image_to_string_defaults_to_text_buidler(self, popen, copen,
                                                  temp_file, get_version):
    """Check that omitting ``builder`` yields plain-text output.

    ``popen``, ``copen``, ``temp_file`` and ``get_version`` are stand-ins
    injected by the caller (presumably patch decorators -- they are not
    visible here); ``get_version`` is made to report ``(4, 0, 0)``.  The
    result must equal the stripped "text" fixture, and cuneiform must have
    been launched exactly once with the ``text`` output format.
    """
    get_version.return_value = (4, 0, 0)
    popen.return_value = self.stdout
    copen.return_value = self.text_file
    temp_file.return_value = self.enter

    # No builder argument on purpose: the default is what is under test.
    output = cuneiform.image_to_string(self.image)

    expected_text = self._get_file_content("text").strip()
    self.assertEqual(output, expected_text)

    expected_command = [
        "cuneiform", "-f", "text", "-o", self.tmp_filename, "-",
    ]
    pipes = {
        "stdin": subprocess.PIPE,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
    }
    popen.assert_called_once_with(expected_command, **pipes)
def __test_txt(self, image_file, expected_output_file, lang="eng"):
    """Compare the text recognised in an image with a reference text file.

    :param image_file: file name appended to ``tests/data/``.
    :param expected_output_file: file name appended to
        ``tests/cuneiform/``; read as utf-8 and stripped of surrounding
        whitespace before the comparison.
    :param lang: language passed on to ``cuneiform.image_to_string``.
    """
    image_file = "tests/data/" + image_file
    expected_output_file = "tests/cuneiform/" + expected_output_file

    with codecs.open(expected_output_file, "r", encoding="utf-8") as reference:
        # Iterating the file yields its lines; gluing them back together
        # gives the whole reference text.
        expected_output = "".join(reference).strip()

    output = cuneiform.image_to_string(Image.open(image_file), lang=lang)
    self.assertEqual(output, expected_output)
def __test_txt(self, image_file, expected_output_file, lang='eng'):
    """Compare the text recognised in an image with a reference text file.

    :param image_file: file name appended to ``tests/data/``.
    :param expected_output_file: file name appended to
        ``tests/cuneiform/``; read as utf-8 and stripped of surrounding
        whitespace before the comparison.
    :param lang: language passed on to ``cuneiform.image_to_string``.
    """
    image_file = "tests/data/" + image_file
    expected_output_file = "tests/cuneiform/" + expected_output_file

    with codecs.open(expected_output_file, 'r', encoding='utf-8') as reference:
        # Iterating the file yields its lines; gluing them back together
        # gives the whole reference text.
        expected_output = "".join(reference).strip()

    output = cuneiform.image_to_string(Image.open(image_file), lang=lang)
    self.assertEqual(output, expected_output)
def test_text_non_rgb_image(self, popen, copen, temp_file):
    """Check that ``image_to_string`` accepts an image that is not RGB.

    The fixture image is converted to mode ``"L"`` before it is handed
    over, so any mode conversion has to happen inside the function under
    test.  ``popen``, ``copen`` and ``temp_file`` are stand-ins injected
    by the caller (presumably patch decorators -- not visible here).
    """
    image = self.image.convert("L")

    popen.return_value = self.stdout
    copen.return_value = self.text_file
    temp_file.return_value = self.enter

    output = cuneiform.image_to_string(image, builder=self.builder)

    expected_text = self._get_file_content("text").strip()
    self.assertEqual(output, expected_text)

    expected_command = [
        "cuneiform", "-f", "text", "-o", self.tmp_filename, "-",
    ]
    pipes = {
        "stdin": subprocess.PIPE,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
    }
    popen.assert_called_once_with(expected_command, **pipes)
def __test_txt(self, image_file, expected_output_file, lang='eng'):
    """Compare the text recognised in an image with a reference text file.

    :param image_file: file name below ``tests/input/specific``.
    :param expected_output_file: file name below
        ``tests/output/specific/cuneiform``; read as utf-8 and stripped of
        surrounding whitespace before the comparison.
    :param lang: language passed on to ``cuneiform.image_to_string``.
    """
    image_file = os.path.join("tests", "input", "specific", image_file)
    expected_output_file = os.path.join(
        "tests", "output", "specific", "cuneiform", expected_output_file
    )

    with codecs.open(expected_output_file, 'r', encoding='utf-8') as reference:
        # Iterating the file yields its lines; gluing them back together
        # gives the whole reference text.
        expected_output = "".join(reference).strip()

    output = cuneiform.image_to_string(Image.open(image_file), lang=lang)
    self.assertEqual(output, expected_output)
def __test_txt(self, image_file, expected_output_file, lang='eng'):
    """Compare the text recognised in an image with a reference text file.

    :param image_file: file name below ``tests/input/specific``.
    :param expected_output_file: file name below
        ``tests/output/specific/cuneiform``; read as utf-8 and stripped of
        surrounding whitespace before the comparison.
    :param lang: language passed on to ``cuneiform.image_to_string``.
    """
    input_dir = ("tests", "input", "specific")
    reference_dir = ("tests", "output", "specific", "cuneiform")
    image_file = os.path.join(*(input_dir + (image_file,)))
    expected_output_file = os.path.join(
        *(reference_dir + (expected_output_file,))
    )

    with codecs.open(expected_output_file, 'r', encoding='utf-8') as reference:
        # Iterating the file yields its lines; gluing them back together
        # gives the whole reference text.
        expected_output = "".join(reference).strip()

    output = cuneiform.image_to_string(Image.open(image_file), lang=lang)
    self.assertEqual(output, expected_output)
def extract_name_from_image(self, cropped_chunk, counter):
    """Recognise the name shown in a chunk with two OCR engines.

    Before recognition the name area returned by
    ``self.get_part_with_name`` is narrowed further:

    - the icon and the class marker on the left are cut off;
    - the lower half is dropped, as a rough guess at where the second
      line (level and rank) sits.

    :param cropped_chunk: chunk handed to ``self.get_part_with_name``.
    :param counter: not used by this method; kept so that existing
        callers keep working.
    :return: dict with the keys ``'tesseract'`` and ``'cunei'``.  The
        ``'cunei'`` value is ``None`` when ``image_to_string`` raises
        ``pyocr.error.CuneiformError``.
    """
    image_with_name = self.get_part_with_name(cropped_chunk)

    # Keep the horizontal band from 35% to 80% of the width (drops the
    # icon/class on the left) and only the upper 50% of the height
    # (drops the presumed second text line).
    imgwidth, imgheight = image_with_name.size
    left = floor(imgwidth * 0.35)
    right = floor(imgwidth * 0.8)
    lower = floor(imgheight * 0.5)
    name_image = image_with_name.crop((left, 0, right, lower))

    try:
        text_data_cunei = image_to_string(name_image, lang='eng')
    except pyocr.error.CuneiformError:
        # Best effort: a cuneiform failure must not prevent the
        # tesseract result from being returned.
        text_data_cunei = None

    text_data_tesseract = pytesseract.image_to_string(name_image)
    return {'tesseract': text_data_tesseract, 'cunei': text_data_cunei}
def extract_numbers_from_image(self, cropped_chunk, image_name=None):
    """
    Works through CuneiForm.

    The numbers area of the chunk is passed through ``self.grayscale``
    (threshold 70) and ``self.add_border`` before recognition with the
    ``'ruseng'`` language and a line-box builder.

    :param cropped_chunk: chunk handed to ``self.get_part_with_numbers``.
    :param image_name: when given and ``self.debug`` is truthy, the
        prepared image is saved as
        ``debug_images/extracted_numbers-<image_name>``.
    :return: result of ``self.merge_nearest_chars`` applied to the
        recognised line boxes.
    """
    threshold_numbers = 70

    numbers_part = self.get_part_with_numbers(cropped_chunk)
    gray_part = self.grayscale(numbers_part, threshold_numbers)
    prepared = self.add_border(gray_part)

    if image_name is not None:
        if self.debug:
            prepared.save(
                'debug_images/' + 'extracted_numbers-' + image_name
            )

    recognised_lines = image_to_string(
        prepared,
        lang='ruseng',
        builder=pyocr.builders.LineBoxBuilder(),
    )
    return self.merge_nearest_chars(recognised_lines)
def test_write_read(self):
    """Round-trip recognised boxes through the builder's file format.

    The boxes produced for ``tests/data/test.png`` are written to a
    temporary file with ``self.builder.write_file`` and loaded back with
    ``self.builder.read_file``; the reloaded sequence must match the
    original element by element.  The temporary file is always removed.
    """
    source_image = Image.open("tests/data/test.png")
    original_boxes = cuneiform.image_to_string(
        source_image, builder=self.builder
    )
    self.assertTrue(len(original_boxes) > 0)

    raw_handle, tmp_path = tempfile.mkstemp()
    try:
        # Only the path is needed: the file is reopened through
        # codecs.open() to get utf-8 support, so release the raw handle.
        os.close(raw_handle)

        with codecs.open(tmp_path, 'w', encoding='utf-8') as out_file:
            self.builder.write_file(out_file, original_boxes)

        with codecs.open(tmp_path, 'r', encoding='utf-8') as in_file:
            new_boxes = self.builder.read_file(in_file)

        self.assertEqual(len(new_boxes), len(original_boxes))
        for restored, original in zip(new_boxes, original_boxes):
            self.assertEqual(restored, original)
    finally:
        os.remove(tmp_path)
def __test_txt(self, image_file, expected_box_file, lang="eng"):
    """Compare the boxes recognised in an image with a reference box file.

    :param image_file: file name appended to ``tests/data/``.
    :param expected_box_file: file name appended to ``tests/cuneiform/``;
        parsed with ``self.builder.read_file``.
    :param lang: language passed on to ``cuneiform.image_to_string``.
    """
    image_file = "tests/data/" + image_file
    expected_box_file = "tests/cuneiform/" + expected_box_file

    with codecs.open(expected_box_file, "r", encoding="utf-8") as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = cuneiform.image_to_string(
        Image.open(image_file), lang=lang, builder=self.builder
    )
    boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))

    # Box contents must be text: ``unicode`` where that name exists
    # (Python 2.7), ``str`` otherwise (Python 3.x).
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    for actual, expected in zip(boxes, expected_boxes):
        self.assertEqual(type(expected.content), text_type)
        self.assertEqual(type(actual.content), text_type)
        self.assertEqual(actual, expected)
def test_digits_box_not_implemented(self):
    """Check that the builder stored on ``self`` is rejected.

    Calling ``cuneiform.image_to_string`` with ``self.builder`` must raise
    ``NotImplementedError``.
    """
    self.assertRaises(
        NotImplementedError,
        cuneiform.image_to_string,
        self.image,
        builder=self.builder,
    )