def test_init_tesseract_version_4(self, get_version):
    # Check the defaults a fresh LineBoxBuilder exposes when the (mocked)
    # version lookup reports 4.0.0.
    # NOTE(review): `get_version` is presumably injected by a patch decorator
    # that sits above this method, outside this view -- confirm.
    get_version.return_value = (4, 0, 0)

    line_builder = builders.LineBoxBuilder()

    # With version 4.0.0 the page-segmentation flag is expected in its
    # double-dash spelling.
    self.assertListEqual(line_builder.tesseract_flags, ["--psm", "1"])
    self.assertListEqual(line_builder.file_extensions, ["html", "hocr"])
    self.assertListEqual(line_builder.tesseract_configs, ["hocr"])
    self.assertListEqual(line_builder.cuneiform_args, ["-f", "hocr"])
    # Nothing has been parsed yet, so no lines are recorded.
    self.assertListEqual(line_builder.lines, [])
    self.assertEqual(line_builder.tesseract_layout, 1)
def ocr(tool, img, cont='txt'):
    """Run OCR on a grayscale image and return the result in the chosen form.

    Args:
        tool: OCR tool object exposing
            ``image_to_string(image, lang=..., builder=...)``.
        img: Single-channel float64 array with values in [0, 1]. It is scaled
            by 255 and cast to ``uint8`` without clipping, so values outside
            that range are not guarded against.
        cont: Kind of output to produce:

            * ``'txt'`` -- the recognised text as a Python string.
            * ``'word_boxes'`` -- a list of box objects; ``box.content`` is
              the word and ``box.position`` its position on the page
              (in pixels).
            * ``'line_word_boxes'`` -- a list of line objects;
              ``line.word_boxes`` holds the individual word boxes,
              ``line.content`` the whole text of the line and
              ``line.position`` the position of the whole line on the page
              (in pixels).

    Returns:
        Whatever ``tool.image_to_string`` returns for the selected builder.
        Beware that some OCR tools (Tesseract for instance) may return empty
        boxes.

    Raises:
        ValueError: If ``cont`` is not one of the supported kinds.
    """
    # Pick the builder first so an unsupported `cont` is rejected before any
    # image conversion work is done.
    if cont == 'txt':
        builder = ocrtools.TextBuilder()
    elif cont == 'word_boxes':
        builder = ocrtools.WordBoxBuilder()
    elif cont == 'line_word_boxes':
        builder = ocrtools.LineBoxBuilder()
    else:
        # NOTE: a 'digits' mode (pyocr.tesseract.DigitBuilder) was sketched
        # here earlier but left disabled; the accompanying note said it only
        # works with Tesseract, not libtesseract.
        raise ValueError(" Not supported OCR type ")

    # Recognition always runs with the English language pack; the tool's
    # available languages are not queried.
    page = Image.fromarray((img * 255.0).astype('uint8'), mode='L')
    return tool.image_to_string(page, lang="eng", builder=builder)
def setUp(self, get_version):
    """Prepare the builder, a dummy image and the mocks used by these tests.

    NOTE(review): `get_version` is presumably supplied by a patch decorator
    outside this view -- confirm.
    """
    get_version.return_value = (4, 0, 0)
    self.builder = builders.LineBoxBuilder()
    self.image = Image.new(mode="RGB", size=(1, 1))
    self.text_file = StringIO(self._get_file_content("cuneiform.lines"))

    # Mock whose `stdout.read()` yields a Cuneiform version banner and whose
    # `wait()` reports exit status 0.
    process = MagicMock()
    process.stdout.read.return_value = b"Cuneiform for Linux 1.1.0\n"
    process.wait.return_value = 0
    self.stdout = process

    self.tmp_filename = "/tmp/cuneiform_n0qfk87otxt"

    # Context-manager mock: entering it yields an object whose `.name` is the
    # temporary file path above.
    context = MagicMock()
    tmp_handle = MagicMock()
    context.__enter__.return_value = tmp_handle
    tmp_handle.configure_mock(name=self.tmp_filename)
    self.enter = context
def setUp(self):
    """Create a fresh LineBoxBuilder for each test."""
    line_builder = builders.LineBoxBuilder()
    self.builder = line_builder
def set_builder(self):
    """Store a new LineBoxBuilder in the private `_builder` attribute."""
    line_builder = builders.LineBoxBuilder()
    self._builder = line_builder
def setUp(self, get_version):
    """Prepare a 1x1 RGB placeholder image and a LineBoxBuilder.

    NOTE(review): `get_version` is presumably supplied by a patch decorator
    outside this view -- confirm.
    """
    # Set the mocked version before the builder is constructed.
    get_version.return_value = (4, 0, 0)

    placeholder = Image.new(mode="RGB", size=(1, 1))
    self.image = placeholder

    line_builder = builders.LineBoxBuilder()
    self.builder = line_builder
def setUp(self, get_version):
    """Create a LineBoxBuilder with the mocked version reporting 4.0.0.

    NOTE(review): `get_version` is presumably supplied by a patch decorator
    outside this view -- confirm.
    """
    # Set the mocked version before the builder is constructed.
    get_version.return_value = (4, 0, 0)
    line_builder = builders.LineBoxBuilder()
    self.builder = line_builder