def test_version():
    """Check that the engine's version string starts with a dotted number.

    The version must begin with one digit, then two dot-separated groups of
    one or two digits each (for example ``4.0.0``).
    """
    engine = ocr.Engine("")
    reported = engine.get_version()
    # Printed so the detected version shows up in the captured test output.
    print(reported)
    pattern = re.compile("[0-9][.][0-9]{1,2}[.][0-9]{1,2}")
    assert pattern.match(reported)
def set_tess_path(cls, path: str = None) -> None:
    """Point ``cls.engine`` at a new OCR engine built from a tessdata path.

    Args:
        path: Directory holding the Tesseract data files. When omitted (or
            ``None``), the location is looked up with ``locate_tessdata()``.

    Raises:
        AssertionError: If no path could be determined, the path does not
            exist on disk, or the created engine is ``None``.
    """
    tessdata_dir = locate_tessdata() if path is None else path
    assert tessdata_dir is not None
    assert os.path.exists(tessdata_dir)
    new_engine = ocr.Engine(tessdata_dir)
    cls.engine = new_engine
    assert cls.engine is not None
class GenerateOCRFileTask(speedwagon.tasks.Subtask):
    """Subtask that reads the text from one image and writes it to a file."""

    # Class-level engine shared by all instances; it is replaced every time
    # set_tess_path() is called (which __init__ does).
    engine = ocr.Engine(locate_tessdata())

    def __init__(
        self,
        source_image,
        out_text_file,
        lang="eng",
        tesseract_path=None
    ) -> None:
        """Store the job parameters and (re)configure the shared engine.

        Args:
            source_image: Path of the image to read.
            out_text_file: Path of the text file to write.
            lang: Language code passed to the engine when getting a reader.
            tesseract_path: Tessdata directory. When falsy, the directory is
                looked up with ``locate_tessdata()``.
        """
        super().__init__()
        self._source = source_image
        self._output_text_file = out_text_file
        self._lang = lang
        GenerateOCRFileTask.set_tess_path(tesseract_path or locate_tessdata())
        assert self.engine is not None

    @classmethod
    def set_tess_path(cls, path=None):
        """Replace the shared engine with one built from ``path``.

        Args:
            path: Tessdata directory. When omitted (or ``None``), it is
                looked up with ``locate_tessdata()`` at call time.

        Raises:
            AssertionError: If no path could be determined, the path does
                not exist, or the created engine is ``None``.
        """
        # Resolve the default lazily. A call in the signature would be
        # evaluated only once, when the class is defined, and then reused.
        if path is None:
            path = locate_tessdata()
        assert path is not None
        assert os.path.exists(path)
        cls.engine = ocr.Engine(path)
        assert cls.engine is not None

    def work(self) -> bool:
        """Read the source image and write the extracted text to disk.

        The extracted text and the source path are also stored through
        ``set_results``.

        Returns:
            Always ``True``.
        """
        # Get the ocr text reader for the proper language
        reader = self.engine.get_reader(self._lang)
        self.log("Reading {}".format(os.path.normcase(self._source)))

        f = io.StringIO()
        with contextlib.redirect_stderr(f):
            # Capture the warning messages
            resulting_text = reader.read(self._source)

        stderr_messages = f.getvalue()
        if stderr_messages:
            # Log any error messages
            self.log(stderr_messages.strip())

        # Generate a text file from the text data extracted from the image
        self.log("Writing to {}".format(self._output_text_file))
        with open(self._output_text_file, "w", encoding="utf8") as wf:
            wf.write(resulting_text)

        result = {"text": resulting_text, "source": self._source}
        self.set_results(result)
        return True
def test_no_osd_file(tmpdir_factory):
    """Creating a Reader from an incomplete tessdata folder must fail.

    Only the English trained data is downloaded into the temporary folder;
    constructing ``ocr.Reader`` against it is expected to raise
    ``FileNotFoundError``.
    NOTE(review): judging by the test name, the missing piece is the OSD
    data file - confirm against the Reader implementation.
    """
    english_data_url = "{}/{}/{}".format(
        TESSDATA_SOURCE_URL_BASE, "4.0.0", "eng.traineddata"
    )
    tessdata_path = tmpdir_factory.mktemp("no_osd_tessdata", numbered=False)
    try:
        if not os.path.exists(tessdata_path):
            os.makedirs(tessdata_path)
        download_data(english_data_url, destination=tessdata_path)

        with pytest.raises(FileNotFoundError):
            ocr.Reader(
                language_code="eng",
                tesseract_data_path=tessdata_path
            )
    finally:
        # Remove the folder even when the download or the expectation above
        # fails, so a failed run does not leave the data behind.
        shutil.rmtree(tessdata_path)
def set_tess_path(cls, path=None):
    """Replace ``cls.engine`` with an OCR engine built from ``path``.

    Args:
        path: Tessdata directory. When omitted (or ``None``), it is looked
            up with ``locate_tessdata()`` at call time.

    Raises:
        AssertionError: If no path could be determined, the path does not
            exist, or the created engine is ``None``.
    """
    # Resolve the default lazily. A call placed in the signature would be
    # evaluated only once, at definition time, and then reused for every call.
    if path is None:
        path = locate_tessdata()
    assert path is not None
    assert os.path.exists(path)
    cls.engine = ocr.Engine(path)
    assert cls.engine is not None
class GenerateOCRFileTask(speedwagon.tasks.Subtask):
    """Subtask that reads the text from one image and writes it to a file."""

    # Class-level engine shared by all instances; it is replaced every time
    # set_tess_path() is called (which __init__ does).
    engine = ocr.Engine(locate_tessdata())

    def __init__(
        self,
        source_image: str,
        out_text_file: str,
        lang: str = "eng",
        tesseract_path: str = None
    ) -> None:
        """Store the job parameters and (re)configure the shared engine.

        Args:
            source_image: Path of the image to read.
            out_text_file: Path of the text file to write.
            lang: Language code passed to the engine when getting a reader.
            tesseract_path: Tessdata directory. When falsy, the directory is
                looked up with ``locate_tessdata()``.
        """
        super().__init__()
        self._source = source_image
        self._output_text_file = out_text_file
        self._lang = lang
        self._tesseract_path = tesseract_path
        GenerateOCRFileTask.set_tess_path(tesseract_path or locate_tessdata())
        assert self.engine is not None

    @classmethod
    def set_tess_path(cls, path: str = None) -> None:
        """Replace the shared engine with one built from ``path``.

        Args:
            path: Tessdata directory. When ``None``, it is looked up with
                ``locate_tessdata()``.

        Raises:
            AssertionError: If no path could be determined, the path does
                not exist, or the created engine is ``None``.
        """
        if path is None:
            path = locate_tessdata()
        assert path is not None
        assert os.path.exists(path)
        cls.engine = ocr.Engine(path)
        assert cls.engine is not None

    def work(self) -> bool:
        """Read the source image and write the extracted text to disk.

        The extracted text and the source path are also stored through
        ``set_results``.

        Returns:
            Always ``True``.
        """
        resulting_text = self.read_image(self._source, self._lang)

        # Generate a text file from the text data extracted from the image
        self.log("Writing to {}".format(self._output_text_file))
        with open(self._output_text_file, "w", encoding="utf8") as write_file:
            write_file.write(resulting_text)

        result = {
            "text": resulting_text,
            "source": self._source
        }
        self.set_results(result)
        return True

    def read_image(self, file: str, lang: str) -> str:
        """Extract the text of an image with the shared engine.

        Anything written to stderr while reading is captured and sent to the
        task log, whether the read succeeds or fails.

        Args:
            file: Path of the image to read.
            lang: Language code used to get the reader from the engine.

        Returns:
            The text produced by the reader.

        Raises:
            SpeedwagonException: If the reader raises
                ``ocr.tesseractwrap.TesseractGlueException``.
        """
        if self.engine.data_set_path is None:
            self.engine.data_set_path = self._tesseract_path

        # Get the ocr text reader for the proper language
        reader = self.engine.get_reader(lang)
        self.log("Reading {}".format(os.path.normcase(file)))

        captured_stderr = io.StringIO()
        try:
            # Capture the warning messages
            with contextlib.redirect_stderr(captured_stderr):
                resulting_text = reader.read(file)
        except ocr.tesseractwrap.TesseractGlueException as error:
            raise SpeedwagonException(f"Unable to read {file}") from error
        finally:
            # Log in a finally block so the captured diagnostics are not
            # lost when the read fails.
            stderr_messages = captured_stderr.getvalue()
            if stderr_messages:
                # Log any error messages
                self.log(stderr_messages.strip())
        return resulting_text
def ocr_engine(tess_path):
    """Create and return an OCR engine for the given tessdata path."""
    return ocr.Engine(tess_path)